xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 0e6a1e94b72fa840132b760bf3c9978fdb8bce49)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
19d0967f54SJacob Faibussowitsch #endif
20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
21a2cee5feSJed Brown #include <thrust/remove.h>
22a2cee5feSJed Brown #include <thrust/sort.h>
23a2cee5feSJed Brown #include <thrust/unique.h>
24e8d2b73aSMark Adams 
25b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations")
26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30afb2bd1cSJunchao Zhang 
31afb2bd1cSJunchao Zhang   typedef enum {
32afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
35afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
36afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
37afb2bd1cSJunchao Zhang 
38afb2bd1cSJunchao Zhang   typedef enum {
39afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
50afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
51afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
52afb2bd1cSJunchao Zhang 
53afb2bd1cSJunchao Zhang   typedef enum {
5435cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
5535cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
56afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
57afb2bd1cSJunchao Zhang   */
58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61afb2bd1cSJunchao Zhang #endif
629ae82921SPaul Mullowney 
63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
666fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
67b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
70d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
716fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
72d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
73d460d7bfSJunchao Zhang #endif
74dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
75a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
776fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
786fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
849ae82921SPaul Mullowney 
857f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
882c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
897f756511SDominic Meiser 
9057181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
91a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9257181aedSStefano Zampini 
93c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
94e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
95219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
96c215019aSStefano Zampini 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* For sequential AIJCUSPARSE matrices a single storage format is kept in the
     Mat_SeqAIJCUSPARSE struct, so MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set it */
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1149ae82921SPaul Mullowney 
/*@
  MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
  operation. Only the `MatMult()` operation can use different GPU storage formats

  Not Collective

  Input Parameters:
+ A      - Matrix of type `MATSEQAIJCUSPARSE`
. op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
- format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

  Level: intermediate

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation if the matrix type registered one;
     a no-op for matrix types that did not compose "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
138e057df02SPaul Mullowney 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* Record the preference; the MatSolve implementations consult this flag */
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
147365b711fSMark Adams 
/*@
  MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

  Input Parameters:
+ A       - Matrix of type `MATSEQAIJCUSPARSE`
- use_cpu - set flag for using the built-in CPU `MatSolve()`

  Level: intermediate

  Note:
  The cuSparse LU solver currently computes the factors with the built-in CPU method
  and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
  This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation if it exists; no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
171365b711fSMark Adams 
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    /* every other option is handled by the plain SeqAIJ implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
187e6e9a74fSStefano Zampini 
/* Process run-time options (-mat_cusparse_*) controlling the GPU storage format and the
   cuSPARSE algorithm choices for SpMV, SpMM and CSR-to-CSC conversion */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  /* these options only make sense for a non-factored matrix */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    /* same consistency check against the copied enum values (see MatCUSPARSESpMMAlgorithms[] above) */
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
2239ae82921SPaul Mullowney 
224b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/* Re-pack the host-side (skewed) LU factors of a SeqAIJ matrix into one regular CSR matrix M = (L-I) + U
   on the device, and (re)build the cuSPARSE SpSV descriptors/buffers needed to solve with L and U.
   Only acts when the latest factors live on the CPU (A->offloadmask == PETSC_OFFLOAD_CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];   // number of strictly-lower entries in row i
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // number of U entries in row i, diagonal included
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      // Both L and U descriptors view the same CSR arrays; fill mode/diag type select the triangle used
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry; PETSc stores its inverse in the factor
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
321d460d7bfSJunchao Zhang #else
322d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
323d71ae5a4SJacob Faibussowitsch {
3249ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3259ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3269ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
327aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3289ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3299ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3309ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3319ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3329ae82921SPaul Mullowney 
3339ae82921SPaul Mullowney   PetscFunctionBegin;
3343ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
335c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3369ae82921SPaul Mullowney     try {
3379ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3389ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
339da79fbbcSStefano Zampini       if (!loTriFactor) {
3402cbc15d9SMark         PetscScalar *AALo;
3412cbc15d9SMark 
3429566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3439ae82921SPaul Mullowney 
3449ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3459566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3469566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3479ae82921SPaul Mullowney 
3489ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3499ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3509ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3519ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3529ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3539ae82921SPaul Mullowney         v         = aa;
3549ae82921SPaul Mullowney         vi        = aj;
3559ae82921SPaul Mullowney         offset    = 1;
3569ae82921SPaul Mullowney         rowOffset = 1;
3579ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3589ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
359e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3609ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3619ae82921SPaul Mullowney           rowOffset += nz + 1;
3629ae82921SPaul Mullowney 
363f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
364f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
3659ae82921SPaul Mullowney 
3669ae82921SPaul Mullowney           offset += nz;
3679ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3689ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3699ae82921SPaul Mullowney           offset += 1;
3709ae82921SPaul Mullowney 
3719ae82921SPaul Mullowney           v += nz;
3729ae82921SPaul Mullowney           vi += nz;
3739ae82921SPaul Mullowney         }
3742205254eSKarl Rupp 
375aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3769566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
377da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
378aa372e3fSPaul Mullowney         /* Create the matrix description */
3799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3809566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3811b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3829566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
383afb2bd1cSJunchao Zhang   #else
3849566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
385afb2bd1cSJunchao Zhang   #endif
3869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
388aa372e3fSPaul Mullowney 
389aa372e3fSPaul Mullowney         /* set the operation */
390aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
391aa372e3fSPaul Mullowney 
392aa372e3fSPaul Mullowney         /* set the matrix */
393aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
394aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
395aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
396aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
397aa372e3fSPaul Mullowney 
398aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
399aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
400aa372e3fSPaul Mullowney 
401aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
402aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
403aa372e3fSPaul Mullowney 
404aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
405aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
406aa372e3fSPaul Mullowney 
407afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4089566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
409261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
4101b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4119371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4129371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
4139566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
414afb2bd1cSJunchao Zhang   #endif
415afb2bd1cSJunchao Zhang 
416aa372e3fSPaul Mullowney         /* perform the solve analysis */
4179371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4189f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4199566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4209566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
421aa372e3fSPaul Mullowney 
422da79fbbcSStefano Zampini         /* assign the pointer */
423aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4242cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4269566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4279566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
428da79fbbcSStefano Zampini       } else { /* update values only */
42948a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
430da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4312cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
432da79fbbcSStefano Zampini         v                    = aa;
433da79fbbcSStefano Zampini         vi                   = aj;
434da79fbbcSStefano Zampini         offset               = 1;
435da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
436da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
437f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
438da79fbbcSStefano Zampini           offset += nz;
4392cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
440da79fbbcSStefano Zampini           offset += 1;
441da79fbbcSStefano Zampini           v += nz;
442da79fbbcSStefano Zampini         }
4432cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4449566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
445da79fbbcSStefano Zampini       }
446d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
447d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
448d71ae5a4SJacob Faibussowitsch     }
4499ae82921SPaul Mullowney   }
4503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4519ae82921SPaul Mullowney }
4529ae82921SPaul Mullowney 
453d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
454d71ae5a4SJacob Faibussowitsch {
4559ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
4569ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
4579ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
458aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
4599ae82921SPaul Mullowney   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
4609ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
4619ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4629ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
4639ae82921SPaul Mullowney 
4649ae82921SPaul Mullowney   PetscFunctionBegin;
4653ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
466c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4679ae82921SPaul Mullowney     try {
4689ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4699ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
470da79fbbcSStefano Zampini       if (!upTriFactor) {
4712cbc15d9SMark         PetscScalar *AAUp;
4722cbc15d9SMark 
4739566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
4742cbc15d9SMark 
4759ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4769566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4779566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4789ae82921SPaul Mullowney 
4799ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4809ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4819ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4829ae82921SPaul Mullowney         offset  = nzUpper;
4839ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4849ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4859ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4869ae82921SPaul Mullowney 
487e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4889ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4899ae82921SPaul Mullowney 
490e057df02SPaul Mullowney           /* decrement the offset */
4919ae82921SPaul Mullowney           offset -= (nz + 1);
4929ae82921SPaul Mullowney 
493e057df02SPaul Mullowney           /* first, set the diagonal elements */
4949ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
49509f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
4969ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
4979ae82921SPaul Mullowney 
498f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
499f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
5009ae82921SPaul Mullowney         }
5012205254eSKarl Rupp 
502aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
5039566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
504da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5052205254eSKarl Rupp 
506aa372e3fSPaul Mullowney         /* Create the matrix description */
5079566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
5089566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
5091b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
5109566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
511afb2bd1cSJunchao Zhang   #else
5129566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
513afb2bd1cSJunchao Zhang   #endif
5149566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
5159566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
516aa372e3fSPaul Mullowney 
517aa372e3fSPaul Mullowney         /* set the operation */
518aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
519aa372e3fSPaul Mullowney 
520aa372e3fSPaul Mullowney         /* set the matrix */
521aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
522aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
523aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
524aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
525aa372e3fSPaul Mullowney 
526aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
527aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
528aa372e3fSPaul Mullowney 
529aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
530aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
531aa372e3fSPaul Mullowney 
532aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
533aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
534aa372e3fSPaul Mullowney 
535afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
5369566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
537261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
5381b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
5399371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5409371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
5419566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
542afb2bd1cSJunchao Zhang   #endif
543afb2bd1cSJunchao Zhang 
544aa372e3fSPaul Mullowney         /* perform the solve analysis */
5459371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
5469f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
5479f7ba44dSJacob Faibussowitsch 
5489566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
5499566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
550aa372e3fSPaul Mullowney 
551da79fbbcSStefano Zampini         /* assign the pointer */
552aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
5532cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
5549566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
5559566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
5569566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
557da79fbbcSStefano Zampini       } else {
55848a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
559da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
560da79fbbcSStefano Zampini         offset = nzUpper;
561da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
562da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
563da79fbbcSStefano Zampini 
564da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
565da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
566da79fbbcSStefano Zampini 
567da79fbbcSStefano Zampini           /* decrement the offset */
568da79fbbcSStefano Zampini           offset -= (nz + 1);
569da79fbbcSStefano Zampini 
570da79fbbcSStefano Zampini           /* first, set the diagonal elements */
5712cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
572f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
573da79fbbcSStefano Zampini         }
5742cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5759566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
576da79fbbcSStefano Zampini       }
577d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
578d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
579d71ae5a4SJacob Faibussowitsch     }
5809ae82921SPaul Mullowney   }
5813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5829ae82921SPaul Mullowney }
583d460d7bfSJunchao Zhang #endif
5849ae82921SPaul Mullowney 
585d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
586d71ae5a4SJacob Faibussowitsch {
5879ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
5889ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
5899ae82921SPaul Mullowney   IS                            isrow = a->row, iscol = a->icol;
5909ae82921SPaul Mullowney   PetscBool                     row_identity, col_identity;
5919ae82921SPaul Mullowney   PetscInt                      n = A->rmap->n;
5929ae82921SPaul Mullowney 
5939ae82921SPaul Mullowney   PetscFunctionBegin;
59428b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
595b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
596d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
597d460d7bfSJunchao Zhang #else
5989566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
5999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
600ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
601d460d7bfSJunchao Zhang #endif
602d460d7bfSJunchao Zhang 
603aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = a->nz;
6049ae82921SPaul Mullowney 
605d460d7bfSJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU
606e057df02SPaul Mullowney   /* lower triangular indices */
6079566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
608da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
609da79fbbcSStefano Zampini     const PetscInt *r;
610da79fbbcSStefano Zampini 
6119566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow, &r));
612aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
613aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r + n);
6149566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow, &r));
6159566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
616da79fbbcSStefano Zampini   }
6179ae82921SPaul Mullowney 
618e057df02SPaul Mullowney   /* upper triangular indices */
6199566063dSJacob Faibussowitsch   PetscCall(ISIdentity(iscol, &col_identity));
620da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
621da79fbbcSStefano Zampini     const PetscInt *c;
622da79fbbcSStefano Zampini 
6239566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iscol, &c));
624aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
625aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c + n);
6269566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iscol, &c));
6279566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
628da79fbbcSStefano Zampini   }
6293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6309ae82921SPaul Mullowney }
6319ae82921SPaul Mullowney 
632b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
633d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
634d460d7bfSJunchao Zhang {
635d460d7bfSJunchao Zhang   Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
636d460d7bfSJunchao Zhang   PetscInt                      m  = A->rmap->n;
637d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
638d460d7bfSJunchao Zhang   const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
639d460d7bfSJunchao Zhang   const MatScalar              *Aa = a->a;
640d460d7bfSJunchao Zhang   PetscInt                     *Mj, Mnz;
641d460d7bfSJunchao Zhang   PetscScalar                  *Ma, *D;
642d460d7bfSJunchao Zhang 
643d460d7bfSJunchao Zhang   PetscFunctionBegin;
644d460d7bfSJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
645d460d7bfSJunchao Zhang     if (!fs->csrRowPtr) {                    // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0
646d460d7bfSJunchao Zhang       // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
647d460d7bfSJunchao Zhang       // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
648d460d7bfSJunchao Zhang       Mnz = Ai[m]; // Unz (with the unit diagonal)
649d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Ma));
650d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
651d460d7bfSJunchao Zhang       PetscCall(PetscMalloc1(m, &D));    // the diagonal
652d460d7bfSJunchao Zhang       for (PetscInt i = 0; i < m; i++) {
653d460d7bfSJunchao Zhang         PetscInt ulen = Ai[i + 1] - Ai[i];
654d460d7bfSJunchao Zhang         Mj[Ai[i]]     = i;                                              // diagonal entry
655d460d7bfSJunchao Zhang         PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
656d460d7bfSJunchao Zhang       }
657d460d7bfSJunchao Zhang       // Copy M (U) from host to device
658f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
659f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
660f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
661f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
662d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
663d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));
664d460d7bfSJunchao Zhang 
665d460d7bfSJunchao Zhang       // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
666d460d7bfSJunchao Zhang       // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
667d460d7bfSJunchao Zhang       // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
668d460d7bfSJunchao Zhang       // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
669d460d7bfSJunchao Zhang       // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
670d460d7bfSJunchao Zhang       cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
671d460d7bfSJunchao Zhang       cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
672d460d7bfSJunchao Zhang       const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;
673d460d7bfSJunchao Zhang 
674d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
675d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
676d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
677d460d7bfSJunchao Zhang 
678d460d7bfSJunchao Zhang       // Allocate work vectors in SpSv
679f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
680f4f49eeaSPierre Jolivet       PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));
681d460d7bfSJunchao Zhang 
682d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
683d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
684d460d7bfSJunchao Zhang 
685d460d7bfSJunchao Zhang       // Query buffer sizes for SpSV and then allocate buffers
686d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
687d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
688d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
689d460d7bfSJunchao Zhang 
690aaa8cc7dSPierre Jolivet       PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
691d460d7bfSJunchao Zhang       PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
692d460d7bfSJunchao Zhang       PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
693d460d7bfSJunchao Zhang 
694d460d7bfSJunchao Zhang       // Record for reuse
695d460d7bfSJunchao Zhang       fs->csrVal_h = Ma;
696d460d7bfSJunchao Zhang       fs->diag_h   = D;
697d460d7bfSJunchao Zhang       PetscCall(PetscFree(Mj));
698d460d7bfSJunchao Zhang     }
699d460d7bfSJunchao Zhang     // Copy the value
700d460d7bfSJunchao Zhang     Ma  = fs->csrVal_h;
701d460d7bfSJunchao Zhang     D   = fs->diag_h;
702d460d7bfSJunchao Zhang     Mnz = Ai[m];
703d460d7bfSJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
704d460d7bfSJunchao Zhang       D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
705d460d7bfSJunchao Zhang       Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
706d460d7bfSJunchao Zhang       for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
707d460d7bfSJunchao Zhang     }
708d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
709d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));
710d460d7bfSJunchao Zhang 
711d460d7bfSJunchao Zhang     // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
712d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
713d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
714d460d7bfSJunchao Zhang   }
715d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
716d460d7bfSJunchao Zhang }
717d460d7bfSJunchao Zhang 
718d460d7bfSJunchao Zhang // Solve Ut D U x = b
719d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
720d460d7bfSJunchao Zhang {
721d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
722d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
723d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
724d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
725d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
726d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
727d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
728d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
729d460d7bfSJunchao Zhang 
730d460d7bfSJunchao Zhang   PetscFunctionBegin;
731d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
732d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
733d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
734d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
735d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
736d460d7bfSJunchao Zhang 
737d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
738d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
739d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
740d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
741d460d7bfSJunchao Zhang   } else {
742d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
743d460d7bfSJunchao Zhang   }
744d460d7bfSJunchao Zhang 
745d460d7bfSJunchao Zhang   // Solve Ut Y = X
746d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
747d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
748d460d7bfSJunchao Zhang 
749d460d7bfSJunchao Zhang   // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
750d460d7bfSJunchao Zhang   // It is basically a vector element-wise multiplication, but cublas does not have it!
751d460d7bfSJunchao Zhang   PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));
752d460d7bfSJunchao Zhang 
753d460d7bfSJunchao Zhang   // Solve U X = Y
754d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
755d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
756d460d7bfSJunchao Zhang   } else {
757d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
758d460d7bfSJunchao Zhang   }
759d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
760d460d7bfSJunchao Zhang 
761d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
762d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
763d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
764d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
765d460d7bfSJunchao Zhang   }
766d460d7bfSJunchao Zhang 
767d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
768d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
769d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
770d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
771d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
772d460d7bfSJunchao Zhang }
773d460d7bfSJunchao Zhang #else
774d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
775d71ae5a4SJacob Faibussowitsch {
776087f3262SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
777087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
778aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
779aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
780087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
781087f3262SPaul Mullowney   PetscScalar                       *AAUp;
782087f3262SPaul Mullowney   PetscScalar                       *AALo;
783087f3262SPaul Mullowney   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
784087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
785087f3262SPaul Mullowney   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
786087f3262SPaul Mullowney   const MatScalar                   *aa = b->a, *v;
787087f3262SPaul Mullowney 
788087f3262SPaul Mullowney   PetscFunctionBegin;
7893ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
790c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
791087f3262SPaul Mullowney     try {
7929566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
7939566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
794da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
795087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7969566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
7979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
798087f3262SPaul Mullowney 
799087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
800087f3262SPaul Mullowney         AiUp[0] = (PetscInt)0;
801087f3262SPaul Mullowney         AiUp[n] = nzUpper;
802087f3262SPaul Mullowney         offset  = 0;
803087f3262SPaul Mullowney         for (i = 0; i < n; i++) {
804087f3262SPaul Mullowney           /* set the pointers */
805087f3262SPaul Mullowney           v  = aa + ai[i];
806087f3262SPaul Mullowney           vj = aj + ai[i];
807087f3262SPaul Mullowney           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
808087f3262SPaul Mullowney 
809087f3262SPaul Mullowney           /* first, set the diagonal elements */
810087f3262SPaul Mullowney           AjUp[offset] = (PetscInt)i;
81109f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0 / v[nz];
812087f3262SPaul Mullowney           AiUp[i]      = offset;
81309f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0 / v[nz];
814087f3262SPaul Mullowney 
815087f3262SPaul Mullowney           offset += 1;
816087f3262SPaul Mullowney           if (nz > 0) {
817f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
818f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
819087f3262SPaul Mullowney             for (j = offset; j < offset + nz; j++) {
820087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
821087f3262SPaul Mullowney               AALo[j] = AAUp[j] / v[nz];
822087f3262SPaul Mullowney             }
823087f3262SPaul Mullowney             offset += nz;
824087f3262SPaul Mullowney           }
825087f3262SPaul Mullowney         }
826087f3262SPaul Mullowney 
827aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8289566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
829da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
830087f3262SPaul Mullowney 
831aa372e3fSPaul Mullowney         /* Create the matrix description */
8329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
8339566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8341b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8359566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
836afb2bd1cSJunchao Zhang   #else
8379566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
838afb2bd1cSJunchao Zhang   #endif
8399566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8409566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
841087f3262SPaul Mullowney 
842aa372e3fSPaul Mullowney         /* set the matrix */
843aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
844aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = A->rmap->n;
845aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = A->cmap->n;
846aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
847aa372e3fSPaul Mullowney 
848aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
849aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
850aa372e3fSPaul Mullowney 
851aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
852aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
853aa372e3fSPaul Mullowney 
854aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
855aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
856aa372e3fSPaul Mullowney 
857afb2bd1cSJunchao Zhang         /* set the operation */
858afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
859afb2bd1cSJunchao Zhang 
860afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8619566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
862261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
8631b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8649371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8659371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
8669566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
867afb2bd1cSJunchao Zhang   #endif
868afb2bd1cSJunchao Zhang 
869aa372e3fSPaul Mullowney         /* perform the solve analysis */
8709371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
8719f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
8729f7ba44dSJacob Faibussowitsch 
8739566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8749566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
875aa372e3fSPaul Mullowney 
876da79fbbcSStefano Zampini         /* assign the pointer */
877aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
878aa372e3fSPaul Mullowney 
879aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8809566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
881da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
882aa372e3fSPaul Mullowney 
883aa372e3fSPaul Mullowney         /* Create the matrix description */
8849566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8859566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8861b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
888afb2bd1cSJunchao Zhang   #else
8899566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
890afb2bd1cSJunchao Zhang   #endif
8919566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8929566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
893aa372e3fSPaul Mullowney 
894aa372e3fSPaul Mullowney         /* set the operation */
895aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
896aa372e3fSPaul Mullowney 
897aa372e3fSPaul Mullowney         /* set the matrix */
898aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
899aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = A->rmap->n;
900aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = A->cmap->n;
901aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
902aa372e3fSPaul Mullowney 
903aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
904aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
905aa372e3fSPaul Mullowney 
906aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
907aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
908aa372e3fSPaul Mullowney 
909aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
910aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
911aa372e3fSPaul Mullowney 
912afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
9139566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
914261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
9151b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9169371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9179371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
9189566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
919afb2bd1cSJunchao Zhang   #endif
920afb2bd1cSJunchao Zhang 
921aa372e3fSPaul Mullowney         /* perform the solve analysis */
9229371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
9239f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
9249f7ba44dSJacob Faibussowitsch 
9259566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
9269566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
927aa372e3fSPaul Mullowney 
928da79fbbcSStefano Zampini         /* assign the pointer */
929aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
930087f3262SPaul Mullowney 
9319566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
9329566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
9339566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
934da79fbbcSStefano Zampini       } else {
935da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
936da79fbbcSStefano Zampini         offset = 0;
937da79fbbcSStefano Zampini         for (i = 0; i < n; i++) {
938da79fbbcSStefano Zampini           /* set the pointers */
939da79fbbcSStefano Zampini           v  = aa + ai[i];
940da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
941da79fbbcSStefano Zampini 
942da79fbbcSStefano Zampini           /* first, set the diagonal elements */
943da79fbbcSStefano Zampini           AAUp[offset] = 1.0 / v[nz];
944da79fbbcSStefano Zampini           AALo[offset] = 1.0 / v[nz];
945da79fbbcSStefano Zampini 
946da79fbbcSStefano Zampini           offset += 1;
947da79fbbcSStefano Zampini           if (nz > 0) {
948f4f49eeaSPierre Jolivet             PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
949da79fbbcSStefano Zampini             for (j = offset; j < offset + nz; j++) {
950da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
951da79fbbcSStefano Zampini               AALo[j] = AAUp[j] / v[nz];
952da79fbbcSStefano Zampini             }
953da79fbbcSStefano Zampini             offset += nz;
954da79fbbcSStefano Zampini           }
955da79fbbcSStefano Zampini         }
95628b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
95728b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
958da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
959da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
9609566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
961da79fbbcSStefano Zampini       }
9629566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
9639566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
964d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
965d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
966d71ae5a4SJacob Faibussowitsch     }
967087f3262SPaul Mullowney   }
9683ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
969087f3262SPaul Mullowney }
970d460d7bfSJunchao Zhang #endif
971087f3262SPaul Mullowney 
972d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
973d71ae5a4SJacob Faibussowitsch {
974087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
975087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
976087f3262SPaul Mullowney   IS                            ip                 = a->row;
977087f3262SPaul Mullowney   PetscBool                     perm_identity;
978087f3262SPaul Mullowney   PetscInt                      n = A->rmap->n;
979087f3262SPaul Mullowney 
980087f3262SPaul Mullowney   PetscFunctionBegin;
98128b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
982d460d7bfSJunchao Zhang 
983b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
984d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
985d460d7bfSJunchao Zhang #else
9869566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
987ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
988d460d7bfSJunchao Zhang #endif
989aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
990aa372e3fSPaul Mullowney 
991da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
992da79fbbcSStefano Zampini 
993087f3262SPaul Mullowney   /* lower triangular indices */
9949566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
995087f3262SPaul Mullowney   if (!perm_identity) {
9964e4bbfaaSStefano Zampini     IS              iip;
997da79fbbcSStefano Zampini     const PetscInt *irip, *rip;
9984e4bbfaaSStefano Zampini 
9999566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
10009566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip, &irip));
10019566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip, &rip));
1002aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
1003aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
1004aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
10054e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
10069566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip, &irip));
10079566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
10089566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip, &rip));
10099566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
1010da79fbbcSStefano Zampini   }
10113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1012087f3262SPaul Mullowney }
1013087f3262SPaul Mullowney 
1014d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
1015d71ae5a4SJacob Faibussowitsch {
1016087f3262SPaul Mullowney   PetscFunctionBegin;
10179566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
10189566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
1019ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
1020d460d7bfSJunchao Zhang 
1021b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1022d460d7bfSJunchao Zhang   B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
1023d460d7bfSJunchao Zhang   B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
1024d460d7bfSJunchao Zhang #else
1025087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
1026d460d7bfSJunchao Zhang   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
1027d460d7bfSJunchao Zhang   IS          ip = b->row;
1028d460d7bfSJunchao Zhang   PetscBool   perm_identity;
1029d460d7bfSJunchao Zhang 
10309566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
1031087f3262SPaul Mullowney   if (perm_identity) {
1032087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
1033087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
1034087f3262SPaul Mullowney   } else {
1035087f3262SPaul Mullowney     B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
1036087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
1037d460d7bfSJunchao Zhang   }
1038d460d7bfSJunchao Zhang #endif
10394e4bbfaaSStefano Zampini   B->ops->matsolve          = NULL;
10404e4bbfaaSStefano Zampini   B->ops->matsolvetranspose = NULL;
1041087f3262SPaul Mullowney 
1042087f3262SPaul Mullowney   /* get the triangular factors */
10439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
10443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1045087f3262SPaul Mullowney }
10469ae82921SPaul Mullowney 
1047b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1048d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1049d71ae5a4SJacob Faibussowitsch {
1050bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1051aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1052aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1053da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1054da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1055aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1056aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1057aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1058aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1059b175d8bbSPaul Mullowney 
1060bda325fcSPaul Mullowney   PetscFunctionBegin;
1061aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10629566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1063da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1064aa372e3fSPaul Mullowney 
1065aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1066aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1067aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10689371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1069aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1070aa372e3fSPaul Mullowney 
1071aa372e3fSPaul Mullowney   /* Create the matrix description */
10729566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10739566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10749566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10759566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10769566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1077aa372e3fSPaul Mullowney 
1078aa372e3fSPaul Mullowney   /* set the operation */
1079aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1080aa372e3fSPaul Mullowney 
1081aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1082aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1083afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1084afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1085aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1086afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1087afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1088afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1089aa372e3fSPaul Mullowney 
1090aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1091afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10929371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
10939371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10949371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10959566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1096afb2bd1cSJunchao Zhang   #endif
1097afb2bd1cSJunchao Zhang 
10989566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10999f7ba44dSJacob Faibussowitsch   {
11009f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11019f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
11029371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1103afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11049f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1105afb2bd1cSJunchao Zhang   #else
11069f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1107afb2bd1cSJunchao Zhang   #endif
11089f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11099f7ba44dSJacob Faibussowitsch   }
11109f7ba44dSJacob Faibussowitsch 
11119566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11129566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1113aa372e3fSPaul Mullowney 
1114afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11159566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1116261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11171b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11189371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11199371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11209566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1121afb2bd1cSJunchao Zhang   #endif
1122afb2bd1cSJunchao Zhang 
1123afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11249371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11259f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11269f7ba44dSJacob Faibussowitsch 
11279566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11289566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1129aa372e3fSPaul Mullowney 
1130da79fbbcSStefano Zampini   /* assign the pointer */
1131aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1132aa372e3fSPaul Mullowney 
1133aa372e3fSPaul Mullowney   /*********************************************/
1134aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1135aa372e3fSPaul Mullowney   /*********************************************/
1136aa372e3fSPaul Mullowney 
1137aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11389566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1139da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1140aa372e3fSPaul Mullowney 
1141aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1142aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1143aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11449371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1145aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1146aa372e3fSPaul Mullowney 
1147aa372e3fSPaul Mullowney   /* Create the matrix description */
11489566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11499566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11509566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11519566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11529566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1153aa372e3fSPaul Mullowney 
1154aa372e3fSPaul Mullowney   /* set the operation */
1155aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1156aa372e3fSPaul Mullowney 
1157aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1158aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1159afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1160afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1161aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1162afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1163afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1164afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1165aa372e3fSPaul Mullowney 
1166aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1167afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11689371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11699371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11709371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11719566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1172afb2bd1cSJunchao Zhang   #endif
1173afb2bd1cSJunchao Zhang 
11749566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11759f7ba44dSJacob Faibussowitsch   {
11769f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11779f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11789371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1179afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11809f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1181afb2bd1cSJunchao Zhang   #else
11829f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1183afb2bd1cSJunchao Zhang   #endif
11849f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11859f7ba44dSJacob Faibussowitsch   }
1186d49cd2b7SBarry Smith 
11879566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11889566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1189aa372e3fSPaul Mullowney 
1190afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11919566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1192261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
11931b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11949371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11959371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
11969566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1197afb2bd1cSJunchao Zhang   #endif
1198afb2bd1cSJunchao Zhang 
1199afb2bd1cSJunchao Zhang   /* perform the solve analysis */
12005f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
12019371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12029f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1203d49cd2b7SBarry Smith 
12049566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12059566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1206aa372e3fSPaul Mullowney 
1207da79fbbcSStefano Zampini   /* assign the pointer */
1208aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1210bda325fcSPaul Mullowney }
1211d460d7bfSJunchao Zhang #endif
1212bda325fcSPaul Mullowney 
/* Thrust unary functor: truncate the real part of a PetscScalar to a PetscInt.
   Used to turn csr2csc-permuted "value" arrays (filled with 0,1,2,...) back into integer indices. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar v) { return static_cast<PetscInt>(PetscRealPart(v)); }
};
1216a49f1ed0SStefano Zampini 
/* Build (or refresh) an explicit device-side transpose of a SeqAIJCUSPARSE matrix in
   cusparsestruct->matTranspose. On first call the transpose structure and its CSR (or HYB/ELL,
   pre-CUDA-11) storage are allocated; on later calls only the numerical values are updated,
   via the cached csr2csc_i permutation. Sets A->transupdated on success; a no-op if it is
   already set. Collective GPU work is logged under MAT_CUSPARSEGenerateTranspose. */
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  /* make sure the (non-transposed) matrix is up to date on the GPU before transposing it */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* transpose already current: nothing to do */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR formats are rebuilt from scratch: drop any stale transpose first */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    /* the transpose inherits the index base (0- or 1-based) of the original descriptor */
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* transpose dimensions: rows and columns swap, nonzero count is unchanged */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the host row offsets a->i; csr2csc below needs the uncompressed offsets on device */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* pre-CUDA-11 HYB/ELL path: go HYB -> CSR -> CSC (= transpose) -> HYB via temporaries */
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows       = A->rmap->n;
      tempT->num_cols       = A->cmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC value permutation once: run csr2csc on the sequence 0,1,2,...
         stored as scalars; the permuted scalars coming out in matrixT->values are then
         converted back to integer indices (csr2csc_i) used for all later value updates. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* numerical update: gather the current values through the cached permutation into the transpose */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1409bda325fcSPaul Mullowney 
1410b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Solve A x = b with the cached cusparse SpSV LU factorization: optional row permutation of b,
// forward substitution with L, backward substitution with U, then optional column permutation into x.
// Requires the SpSV descriptors/analysis set up at factorization time (CUDA >= 11.4).
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *factors   = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *seqaij    = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               algorithm = CUSPARSE_SPSV_ALG_DEFAULT;
  const PetscInt                        nrows     = A->rmap->n;
  const PetscScalar                    *bdata;
  PetscScalar                          *xdata;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  xdev = thrust::device_pointer_cast(xdata);
  bdev = thrust::device_pointer_cast(bdata);

  // Apply the row permutation (if any) by gathering b into the scratch vector fs->X;
  // otherwise point the dense-vector descriptor straight at b's device array
  if (factors->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, factors->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, factors->rpermIndices->end()), thrust::device_pointer_cast(factors->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdata));
  }

  // Forward substitution: L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, operation, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, algorithm, factors->spsvDescr_L));

  // Backward substitution: U X = Y; target the scratch vector when a column permutation must follow
  if (factors->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, factors->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdata));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, operation, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, algorithm, factors->spsvDescr_U));

  // Scatter the column-permuted result from the scratch vector back into x when needed
  if (factors->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X), factors->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(factors->X + nrows), factors->cpermIndices->end()), xdev));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  // one multiply-add per off-diagonal nonzero plus a division per row: 2*nz - n flops
  PetscCall(PetscLogGpuFlops(2.0 * seqaij->nz - nrows));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1462d460d7bfSJunchao Zhang 
/* Solve A^T x = b with the cached cusparse SpSV LU factorization by solving U^T y = b' then
   L^T x' = y (same L/U matrix descriptors as the forward solve, with CUSPARSE_OPERATION_TRANSPOSE).
   Transpose-specific SpSV descriptors (spsvDescr_Lt/Ut), their buffers, and the analysis phase are
   created lazily on the first call and cached via the created/updated flags. Row/column permutations
   from the factorization are applied when present. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (re)run the transpose solve analysis; the flag is presumably reset when the factor values change — TODO confirm */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  /* one multiply-add per off-diagonal nonzero plus a division per row */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1533d460d7bfSJunchao Zhang #else
1534a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1535d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1536d71ae5a4SJacob Faibussowitsch {
1537c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1538465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1539465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1540465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1541465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1542bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1543aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1544aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1545aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1546bda325fcSPaul Mullowney 
1547bda325fcSPaul Mullowney   PetscFunctionBegin;
1548aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1549aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15509566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1551aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1552aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1553bda325fcSPaul Mullowney   }
1554bda325fcSPaul Mullowney 
1555bda325fcSPaul Mullowney   /* Get the GPU pointers */
15569566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15579566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1558c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1559c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1560bda325fcSPaul Mullowney 
15619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1562aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15639371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1564aa372e3fSPaul Mullowney 
1565aa372e3fSPaul Mullowney   /* First, solve U */
15669f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15679f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1568aa372e3fSPaul Mullowney 
1569aa372e3fSPaul Mullowney   /* Then, solve L */
15709f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15719f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1572aa372e3fSPaul Mullowney 
1573aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15749371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1575aa372e3fSPaul Mullowney 
1576aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1577a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1578bda325fcSPaul Mullowney 
1579bda325fcSPaul Mullowney   /* restore */
15809566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
15819566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
15829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
15843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1585bda325fcSPaul Mullowney }
1586bda325fcSPaul Mullowney 
1587d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1588d71ae5a4SJacob Faibussowitsch {
1589465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1590465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1591bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1592aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1593aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1594aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1595bda325fcSPaul Mullowney 
1596bda325fcSPaul Mullowney   PetscFunctionBegin;
1597aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1598aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15999566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1600aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1601aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1602bda325fcSPaul Mullowney   }
1603bda325fcSPaul Mullowney 
1604bda325fcSPaul Mullowney   /* Get the GPU pointers */
16059566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16069566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1607bda325fcSPaul Mullowney 
16089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1609aa372e3fSPaul Mullowney   /* First, solve U */
16109f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16119f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1612aa372e3fSPaul Mullowney 
1613aa372e3fSPaul Mullowney   /* Then, solve L */
16149f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16159f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1616bda325fcSPaul Mullowney 
1617bda325fcSPaul Mullowney   /* restore */
16189566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16199566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16219566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16223ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1623bda325fcSPaul Mullowney }
1624bda325fcSPaul Mullowney 
1625d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1626d71ae5a4SJacob Faibussowitsch {
1627465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1628465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1629465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1630465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16319ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1632aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1633aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1634aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16359ae82921SPaul Mullowney 
16369ae82921SPaul Mullowney   PetscFunctionBegin;
1637e057df02SPaul Mullowney   /* Get the GPU pointers */
16389566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16399566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1640c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1641c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16429ae82921SPaul Mullowney 
16439566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1644aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
16459371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1646aa372e3fSPaul Mullowney 
1647aa372e3fSPaul Mullowney   /* Next, solve L */
16489f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16499f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1650aa372e3fSPaul Mullowney 
1651aa372e3fSPaul Mullowney   /* Then, solve U */
16529f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16539f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1654d49cd2b7SBarry Smith 
16554e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16569371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16579ae82921SPaul Mullowney 
16589566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16599566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16619566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16639ae82921SPaul Mullowney }
16649ae82921SPaul Mullowney 
1665d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1666d71ae5a4SJacob Faibussowitsch {
1667465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1668465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16699ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1670aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1671aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1672aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16739ae82921SPaul Mullowney 
16749ae82921SPaul Mullowney   PetscFunctionBegin;
1675e057df02SPaul Mullowney   /* Get the GPU pointers */
16769566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16779566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16789ae82921SPaul Mullowney 
16799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1680aa372e3fSPaul Mullowney   /* First, solve L */
16819f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16829f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1683d49cd2b7SBarry Smith 
1684aa372e3fSPaul Mullowney   /* Next, solve U */
16859f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16869f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
16879ae82921SPaul Mullowney 
16889566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16899566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16909566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16939ae82921SPaul Mullowney }
1694d460d7bfSJunchao Zhang #endif
16959ae82921SPaul Mullowney 
1696b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
16978eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1698d71ae5a4SJacob Faibussowitsch {
1699da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1700da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1701da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1702da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1703da112707SJunchao Zhang   PetscInt                      m, nz;
1704da112707SJunchao Zhang   PetscBool                     flg;
1705da112707SJunchao Zhang 
1706da112707SJunchao Zhang   PetscFunctionBegin;
1707da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1708da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1709da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1710da112707SJunchao Zhang   }
1711da112707SJunchao Zhang 
1712da112707SJunchao Zhang   /* Copy A's value to fact */
1713da112707SJunchao Zhang   m  = fact->rmap->n;
1714da112707SJunchao Zhang   nz = aij->nz;
1715da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1716da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1717da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1718da112707SJunchao Zhang 
1719bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeBegin());
1720da112707SJunchao Zhang   /* Factorize fact inplace */
17219371c9d4SSatish Balay   if (m)
17229371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1723d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1724da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1725da112707SJunchao Zhang     int              numerical_zero;
1726da112707SJunchao Zhang     cusparseStatus_t status;
1727da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1728da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1729da112707SJunchao Zhang   }
1730da112707SJunchao Zhang 
173112ba2bc6SJunchao Zhang   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
173212ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
173312ba2bc6SJunchao Zhang   */
17349371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1735da112707SJunchao Zhang 
17369371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1737da112707SJunchao Zhang 
173812ba2bc6SJunchao Zhang   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
173912ba2bc6SJunchao Zhang   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
174012ba2bc6SJunchao Zhang 
1741da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1742d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1743d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1744da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1745da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1746bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeEnd());
1747da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1749da112707SJunchao Zhang }
1750da112707SJunchao Zhang 
17518eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1752d71ae5a4SJacob Faibussowitsch {
1753da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1754da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1755da112707SJunchao Zhang   PetscInt                      m, nz;
1756da112707SJunchao Zhang 
1757da112707SJunchao Zhang   PetscFunctionBegin;
1758da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1759da112707SJunchao Zhang     PetscInt  i;
1760da112707SJunchao Zhang     PetscBool flg, missing;
1761da112707SJunchao Zhang 
1762da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1763da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1764da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1765da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1766da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1767da112707SJunchao Zhang   }
1768da112707SJunchao Zhang 
1769da112707SJunchao Zhang   /* Free the old stale stuff */
1770da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1771da112707SJunchao Zhang 
1772da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1773da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1774da112707SJunchao Zhang    */
1775da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1776da112707SJunchao Zhang 
1777da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1778da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1779da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1780da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1781da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1782da112707SJunchao Zhang 
1783da112707SJunchao Zhang   aij->row = NULL;
1784da112707SJunchao Zhang   aij->col = NULL;
1785da112707SJunchao Zhang 
1786da112707SJunchao Zhang   /* ====================================================================== */
1787da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1788da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1789da112707SJunchao Zhang   /* ====================================================================== */
1790da112707SJunchao Zhang   const int *Ai, *Aj;
1791da112707SJunchao Zhang 
1792da112707SJunchao Zhang   m  = fact->rmap->n;
1793da112707SJunchao Zhang   nz = aij->nz;
1794da112707SJunchao Zhang 
1795f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1796f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1797f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1798d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1799d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1800d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1801da112707SJunchao Zhang 
1802da112707SJunchao Zhang   /* ====================================================================== */
1803da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1804da112707SJunchao Zhang   /* ====================================================================== */
1805da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1806da112707SJunchao Zhang   cusparseDiagType_t diagType;
1807da112707SJunchao Zhang 
1808da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1809da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1810da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1811da112707SJunchao Zhang 
1812da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1813da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1814da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1815da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1816da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1817da112707SJunchao Zhang   */
1818da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1819da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1820d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18219371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18229371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1823da112707SJunchao Zhang 
1824da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1825da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1826d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18279371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18289371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1829da112707SJunchao Zhang 
1830da112707SJunchao Zhang   /* ========================================================================= */
1831da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1832da112707SJunchao Zhang   /* ========================================================================= */
1833da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18349371c9d4SSatish Balay   if (m)
18359371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1836d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1837da112707SJunchao Zhang 
1838da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1839da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1840da112707SJunchao Zhang 
1841da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1842da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1843da112707SJunchao Zhang 
1844da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18459371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1846da112707SJunchao Zhang 
1847da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18489371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1849da112707SJunchao Zhang 
1850da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
185112ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
185212ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
185312ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1854da112707SJunchao Zhang    */
185512ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
185612ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
185712ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1858da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
185912ba2bc6SJunchao Zhang   } else {
186012ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
186112ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1862da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
186312ba2bc6SJunchao Zhang   }
1864da112707SJunchao Zhang 
1865da112707SJunchao Zhang   /* ========================================================================== */
1866da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1867da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1868da112707SJunchao Zhang   /* ========================================================================== */
1869da112707SJunchao Zhang   int              structural_zero;
1870da112707SJunchao Zhang   cusparseStatus_t status;
1871da112707SJunchao Zhang 
1872da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18739371c9d4SSatish Balay   if (m)
18749371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1875d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1876da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1877da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1878da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1879da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1880da112707SJunchao Zhang   }
1881da112707SJunchao Zhang 
1882da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18830dd8c0acSJunchao Zhang   {
1884da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
18850dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1886da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1887da112707SJunchao Zhang 
1888da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1889da112707SJunchao Zhang     Ai    = Aseq->i;
1890da112707SJunchao Zhang     Adiag = Aseq->diag;
1891da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1892da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1893da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1894da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1895da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1896da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1897da112707SJunchao Zhang         */
1898da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1899da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1900da112707SJunchao Zhang       }
1901da112707SJunchao Zhang     }
1902da112707SJunchao Zhang     fs->numericFactFlops = flops;
19030dd8c0acSJunchao Zhang   }
1904da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1906da112707SJunchao Zhang }
1907da112707SJunchao Zhang 
/* Solve A x = b using the on-GPU IC(0) factors, i.e. two triangular solves with L and L^T */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b; the intermediate y lives in the work array fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve L^T x = y, writing the result straight into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1938da112707SJunchao Zhang 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric IC(0) factorization of A, performed in place
  in fact's value array on the GPU via cusparseXcsric02().

  The sparsity pattern, descriptors (spMatDescr_L, spsvDescr_L/Lt) and buffers were set up by
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(), which also set fact->ops->choleskyfactornumeric
  to this routine.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure A's current values are resident on the GPU before the device-to-device copy */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* guarded: csric02 routines error out on empty (m=0) matrices */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Run the SpSV analysis phases for the solves with L and L^T, using the buffers sized in the symbolic phase */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* same routine: with A = L L^T, solve and solvetranspose coincide */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1991da112707SJunchao Zhang 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic setup for a level-0 incomplete Cholesky
  factorization done entirely on the GPU with cusparseXcsric02().

  Since IC(0) introduces no fill, fact reuses A's sparsity pattern (i, j copied on device).
  This routine allocates device storage, creates the cuSPARSE descriptors for M (the in-place
  factor container) and L, sizes the factorization and SpSV buffers (sharing the larger solve
  buffer with the factorization buffer to save memory), runs the csric02 analysis, estimates
  the numeric-phase flops, and finally installs MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0.

  The IS argument (fill-reducing ordering) is unnamed/ignored: callers only select this path
  for the identity permutation.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) keeps A's pattern, so no extra fill is needed */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* M uses the legacy descriptor required by the csric02 API */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L is a generic-API view of the same device arrays M factorizes in place */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); /* guarded: csric02 routines error out on empty (m=0) matrices */

  /* Work vectors used as SpSV input/output placeholders; MatSolve rebinds the actual arrays */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* logged later by the numeric routine */
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2133da112707SJunchao Zhang #endif
2134da112707SJunchao Zhang 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization for MATSEQAIJCUSPARSE

  The factorization itself is done on the CPU by MatLUFactorNumeric_SeqAIJ() (after pulling A's
  values back from the GPU); unless the user requested CPU solves, the triangular solve
  function pointers are then redirected to GPU implementations and the factors are analyzed
  and copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* the CPU kernel below needs A's host values up to date */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* factors currently live on the host only */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    /* natural (identity) row/column ordering enables the cheaper solve path */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2173d460d7bfSJunchao Zhang 
/* Symbolic LU factorization for MATSEQAIJCUSPARSE: run the host AIJ symbolic phase,
   then install the CUSPARSE-aware numeric routine for the next stage */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Drop any stale GPU triangular factors from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2184d460d7bfSJunchao Zhang 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization dispatcher

  With CUDA >= 11.4, a level-0 factorization with natural (identity) row/column ordering and
  device-side factorization enabled is routed to the all-GPU ILU0 path; everything else falls
  back to the host AIJ symbolic phase followed by the CUSPARSE numeric routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  /* Only bother checking the orderings if device factorization was requested; the ILU0
     path additionally requires identity orderings */
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Host fallback: reset stale GPU factors, do the CPU symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2207da112707SJunchao Zhang 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization dispatcher

  With CUDA >= 11.4, a level-0 factorization with identity permutation and device-side
  factorization enabled is routed to the all-GPU ICC0 path; everything else falls back to
  the host AIJ symbolic phase followed by the CUSPARSE numeric routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool perm_identity = PETSC_FALSE;
  /* The ICC0 path requires the identity permutation; only check when device factorization was requested */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* Host fallback: reset stale GPU factors, do the CPU symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2227da112707SJunchao Zhang 
/* Symbolic Cholesky factorization for MATSEQAIJCUSPARSE: run the host AIJ symbolic phase,
   then install the CUSPARSE-aware numeric routine for the next stage */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop any stale GPU triangular factors from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2238da112707SJunchao Zhang 
/* Report the MatSolverType provided by this factorization implementation ("cusparse");
   the Mat argument is unused and therefore unnamed */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2245841d4cb1SJunchao Zhang 
2246841d4cb1SJunchao Zhang /*MC
2247841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
224811a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2250841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
225111a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2252841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2253841d4cb1SJunchao Zhang 
2254841d4cb1SJunchao Zhang   Level: beginner
2255841d4cb1SJunchao Zhang 
22561cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22572ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2258841d4cb1SJunchao Zhang M*/
2259841d4cb1SJunchao Zhang 
/*
  Creates the factor matrix B (of type MATSEQAIJCUSPARSE) for A and installs the
  symbolic-factorization function pointers matching the requested factor type.
  If A is bound to the CPU, the plain SeqAIJ symbolic routines are used instead
  of the CUSPARSE ones. Only LU/ILU/ILUDT and Cholesky/ICC factor types are supported.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Query the options database (under the factor's prefix, falling back to A's)
     for whether the numeric factorization should run on host or device */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* Matrix is bound to CPU: fall back to the host (SeqAIJ) symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2312841d4cb1SJunchao Zhang 
/*
  Copies the matrix values (a->a) from the GPU back to the host when the data
  lives only on the device (PETSC_OFFLOAD_GPU). Only the values are copied;
  the sparsity pattern (i, j) is assumed identical on both sides. On success
  the offload mask becomes PETSC_OFFLOAD_BOTH. No-op for any other mask state.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* For factored matrices, spptr holds the triangular-factors struct instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* unfactored matrix: device values live in the CSR struct */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23427e8381f9SStefano Zampini 
/* Hand out a read/write pointer to the host-side values array, syncing
   them from the GPU first so the caller sees current data. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
235067a45760SJunchao Zhang 
/* End read/write access to the host values array: the host copy may have been
   modified, so mark the device copy as stale and null out the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
235867a45760SJunchao Zhang 
/* Hand out a read-only pointer to the host-side values array, syncing
   them from the GPU first; the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236667a45760SJunchao Zhang 
/* End read-only access: nothing was modified, so no offload-mask change is
   needed -- only invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237367a45760SJunchao Zhang 
/* Hand out a write-only pointer to the host values array. No device-to-host
   sync is performed since the existing contents will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
238067a45760SJunchao Zhang 
/* End write access: the host copy is now authoritative, so mark the device
   copy stale and null out the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
23887e8381f9SStefano Zampini 
/*
  Returns device pointers to the CSR representation (row offsets i, column
  indices j, values a) of the matrix, plus the memory type (CUDA).
  The matrix is first synced to the GPU. Not supported for factored matrices,
  nor for i/j when PETSc is built with 64-bit indices (cuSPARSE CSR arrays
  here are 32-bit THRUSTINTARRAY32).

  Fix: the error message previously read "cuSparse does not supported 64-bit
  indices" (grammatical error, wrong product casing).
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24197ee59b9bSJunchao Zhang 
/*
  Copies/builds the device (cuSPARSE) representation of a SeqAIJ matrix when
  the up-to-date data lives on the host (PETSC_OFFLOAD_UNALLOCATED or _CPU).

  Two paths:
  - Same nonzero pattern as last time and CSR format: only the values array is
    re-uploaded and any cached transpose values are invalidated.
  - Otherwise: the existing device structures are destroyed and rebuilt from
    scratch (CSR, or ELL/HYB for CUDA < 11), including the compressed-row
    index array when compressed rows are in use.

  On success the offload mask is set to PETSC_OFFLOAD_BOTH, unless the host
  values array was absent (structure-only upload), in which case it is left as is.
  Errors if the matrix is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed: any cached transpose values are now stale (structure kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      /* Nonzero pattern changed (or non-CSR format): rebuild everything on the device */
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* When compressed rows are used, only rows with nonzeros are stored;
           ridx maps compressed row k back to the global row index */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        /* No host values yet: upload structure only and do not mark the host side valid */
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Pre-CUDA-11 only: build a temporary CSR, convert to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR copy was only needed for the conversion; release it */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25699ae82921SPaul Mullowney 
/* Thrust functor over a 2-tuple: accumulate element 0 into element 1
   (i.e. second += first). Usable on both host and device. */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    auto increment    = thrust::get<0>(t);
    thrust::get<1>(t) = thrust::get<1>(t) + increment;
  }
};
2577aa372e3fSPaul Mullowney 
/* Thrust functor over a 2-tuple: copy element 0 into element 1.
   Usable on both host and device. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    auto src          = thrust::get<0>(t);
    thrust::get<1>(t) = src;
  }
};
25857e8381f9SStefano Zampini 
/* Thrust functor over a 2-tuple: copy element 1 into element 0
   (the opposite direction of VecCUDAEquals). Usable on both host and device. */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    auto src          = thrust::get<1>(t);
    thrust::get<0>(t) = src;
  }
};
2593e6e9a74fSStefano Zampini 
/* Scratch data attached to a product matrix (C->product->data) for
   cuSPARSE-based matrix-matrix products; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* presumably: whether C is dense -- set at the symbolic phase; confirm against callers */
  PetscScalar   *Bt;       /* device buffer (cudaFree'd at destroy); assumed to hold an explicit transpose of B -- TODO confirm */
  Mat            X;        /* intermediate dense matrix used for MATPRODUCT_RARt / MATPRODUCT_PtAP results */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count accumulated for logging */
  CsrMatrix     *Bcsr;     /* CSR copy of B (sparse-sparse products) */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* generic-API sparse descriptor for B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* generic-API dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* generic-API dense descriptor for C */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers required by the CUDA >= 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer in bytes */
  void                 *mmBuffer;     /* main SpMM/SpGEMM work buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc; /* SpGEMM computation descriptor */
#endif
};
2618ccdfe979SStefano Zampini 
/*
  Destructor for the MatMatCusparse scratch data: frees all device buffers,
  destroys the cuSPARSE descriptors (guarded, since they may never have been
  created), destroys the intermediate matrix X, and frees the struct itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2642ccdfe979SStefano Zampini 
26434742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2644ccdfe979SStefano Zampini 
2645d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2646d71ae5a4SJacob Faibussowitsch {
2647ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2648ccdfe979SStefano Zampini   Mat                           A, B;
2649afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2650ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2651ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2652ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2653ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2654ccdfe979SStefano Zampini   const PetscScalar            *barray;
2655ccdfe979SStefano Zampini   PetscScalar                  *carray;
2656ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2657ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2658ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2659ccdfe979SStefano Zampini 
2660ccdfe979SStefano Zampini   PetscFunctionBegin;
2661ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
266228b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2663ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2664ccdfe979SStefano Zampini   A      = product->A;
2665ccdfe979SStefano Zampini   B      = product->B;
26669566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
266728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2668ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2669ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
267028b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2672ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2673ccdfe979SStefano Zampini   switch (product->type) {
2674ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2675ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2676ccdfe979SStefano Zampini     mat = cusp->mat;
2677ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2678ccdfe979SStefano Zampini     m   = A->rmap->n;
2679ccdfe979SStefano Zampini     n   = B->cmap->n;
2680ccdfe979SStefano Zampini     break;
2681ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
26821a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2683e6e9a74fSStefano Zampini       mat = cusp->mat;
2684e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2685e6e9a74fSStefano Zampini     } else {
26869566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2687ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2688ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2689e6e9a74fSStefano Zampini     }
2690ccdfe979SStefano Zampini     m = A->cmap->n;
2691ccdfe979SStefano Zampini     n = B->cmap->n;
2692ccdfe979SStefano Zampini     break;
2693ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2694ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2695ccdfe979SStefano Zampini     mat = cusp->mat;
2696ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2697ccdfe979SStefano Zampini     m   = A->rmap->n;
2698ccdfe979SStefano Zampini     n   = B->rmap->n;
2699ccdfe979SStefano Zampini     break;
2700d71ae5a4SJacob Faibussowitsch   default:
2701d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2702ccdfe979SStefano Zampini   }
270328b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2704ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2705ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27069566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27079566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2708cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2709afb2bd1cSJunchao Zhang 
27109566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2711c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2712cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27139566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2714c8378d12SStefano Zampini   } else {
2715cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27169566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2717c8378d12SStefano Zampini   }
2718c8378d12SStefano Zampini 
27199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2720afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2721afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2722fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
2723fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA];
2724fe5544b9SJunchao Zhang   #else
2725fe5544b9SJunchao Zhang   cusparseSpMatDescr_t &matADescr = mat->matDescr;
2726fe5544b9SJunchao Zhang   #endif
2727fe5544b9SJunchao Zhang 
2728a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2729afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2730fcdce8c4SStefano Zampini     size_t mmBufferSize;
27319371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27329371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27339371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27349371c9d4SSatish Balay     }
2735afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27369566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2737afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2738afb2bd1cSJunchao Zhang     }
2739c8378d12SStefano Zampini 
27409371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27419371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27429371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27439371c9d4SSatish Balay     }
2744afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27459566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2746afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2747afb2bd1cSJunchao Zhang     }
2748afb2bd1cSJunchao Zhang 
2749fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0
2750fe5544b9SJunchao Zhang     if (matADescr) {
275117f5f06fSJunchao Zhang       PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. It could be a cusparse bug
2752fe5544b9SJunchao Zhang       matADescr = NULL;
2753fe5544b9SJunchao Zhang     }
2754fe5544b9SJunchao Zhang   #endif
2755fe5544b9SJunchao Zhang 
2756fe5544b9SJunchao Zhang     if (!matADescr) {
2757fe5544b9SJunchao Zhang       stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27589371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27599371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2760afb2bd1cSJunchao Zhang     }
2761fe5544b9SJunchao Zhang 
2762fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize));
2763fe5544b9SJunchao Zhang 
2764fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27659566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27669566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2767fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2768fcdce8c4SStefano Zampini     }
2769fe5544b9SJunchao Zhang 
2770fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but petsc worked without it until 12.4.0
2771fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2772fe5544b9SJunchao Zhang   #endif
2773fe5544b9SJunchao Zhang 
2774afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2775afb2bd1cSJunchao Zhang   } else {
2776afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2777fe5544b9SJunchao Zhang     PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get()));
27789566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
27799566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2780afb2bd1cSJunchao Zhang   }
2781afb2bd1cSJunchao Zhang 
2782afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2783fe5544b9SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer));
2784afb2bd1cSJunchao Zhang #else
2785afb2bd1cSJunchao Zhang   PetscInt k;
2786afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2787ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2788ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2789ccdfe979SStefano Zampini     cublasStatus_t cerr;
2790ccdfe979SStefano Zampini 
27919566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
27929371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
27939371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2794ccdfe979SStefano Zampini     blda = B->cmap->n;
2795afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2796afb2bd1cSJunchao Zhang   } else {
2797afb2bd1cSJunchao Zhang     k = B->rmap->n;
2798ccdfe979SStefano Zampini   }
2799ccdfe979SStefano Zampini 
2800afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
28019371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
28029371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2803afb2bd1cSJunchao Zhang #endif
28049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
28059566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2806cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2807ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2808cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28094742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2810ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2811cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
28124742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2813ccdfe979SStefano Zampini   } else {
2814cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2815ccdfe979SStefano Zampini   }
281648a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
281748a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
28183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2819ccdfe979SStefano Zampini }
2820ccdfe979SStefano Zampini 
/* Symbolic phase for products of a MATSEQAIJCUSPARSE matrix with a dense matrix:
   fixes the dimensions/block sizes of C, converts C to MATSEQDENSECUDA so the
   numeric phase runs on the GPU, and allocates the per-product workspace
   (MatMatCusparse) consumed by MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 Amat, Bmat;
  PetscInt            nr, nc;
  PetscBool           host_dense, isaijcusp;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  Amat = product->A;
  Bmat = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)Amat, MATSEQAIJCUSPARSE, &isaijcusp));
  PetscCheck(isaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)Amat)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)Amat->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions and block sizes depend on the product type */
  if (product->type == MATPRODUCT_AB) {
    nr = Amat->rmap->n;
    nc = Bmat->cmap->n;
    PetscCall(MatSetBlockSizesFromMats(C, Amat, Bmat));
  } else if (product->type == MATPRODUCT_AtB) {
    nr = Amat->cmap->n;
    nc = Bmat->cmap->n;
    if (Amat->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, Amat->cmap->bs));
    if (Bmat->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, Bmat->cmap->bs));
  } else if (product->type == MATPRODUCT_ABt) {
    nr = Amat->rmap->n;
    nc = Bmat->rmap->n;
    if (Amat->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, Amat->rmap->bs));
    if (Bmat->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, Bmat->rmap->bs));
  } else if (product->type == MATPRODUCT_PtAP) { /* C = P^T A P is square, layouts taken from the columns of P (= B) */
    nr = Bmat->cmap->n;
    nc = Bmat->cmap->n;
    if (Bmat->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, Bmat->cmap->bs));
    if (Bmat->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, Bmat->cmap->bs));
  } else if (product->type == MATPRODUCT_RARt) { /* C = R A R^T is square, layouts taken from the rows of R (= B) */
    nr = Bmat->rmap->n;
    nc = Bmat->rmap->n;
    if (Bmat->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, Bmat->rmap->bs));
    if (Bmat->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, Bmat->rmap->bs));
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, nr, nc, nr, nc));
  /* remember whether the caller asked for a host MATSEQDENSE C: the numeric phase
     computes on the GPU and converts back at the end */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &host_dense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = host_dense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)Bmat->rmap->n * (size_t)Bmat->cmap->n * sizeof(PetscScalar)));
#endif
  /* PtAP and RARt first form the intermediate X = A*P (resp. A*R^T) and finish with a dense product */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mm->X, Amat->rmap->n, Bmat->rmap->n, Amat->rmap->n, Bmat->rmap->n));
    } else {
      PetscCall(MatSetSizes(mm->X, Amat->rmap->n, Bmat->cmap->n, Amat->rmap->n, Bmat->cmap->n));
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2900ccdfe979SStefano Zampini 
/* Numeric phase of the sparse-sparse product C = op(A)*op(B) for two
   MATSEQAIJCUSPARSE matrices: runs the cuSPARSE SpGEMM using the descriptors,
   workspace buffers, and symbolic pattern prepared by
   MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE (stored in C->product->data). */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    /* only validate that the C structures created by the symbolic phase are in place, then skip the GPU compute */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute, just finish assembly bookkeeping */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* since spgemm has no transpose support, AtB/ABt are remapped to AB on the
     explicit transpose (formed in symbolic), or directly to AB when the matrix
     is known symmetric and symbolic exploited that fact */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose; /* explicit transpose built during symbolic */
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose; /* explicit transpose built during symbolic */
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* SpGEMMreuse recomputes values on the pattern fixed during symbolic */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older SpGEMM API: compute into workspace, then copy into C's CSR arrays */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* legacy (CUDA < 11) csrgemm path writing directly into C's CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* fresh values live on the device only */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3021fcdce8c4SStefano Zampini 
3022d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3023d71ae5a4SJacob Faibussowitsch {
3024fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3025fcdce8c4SStefano Zampini   Mat                           A, B;
3026fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
3027fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3028fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3029fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3030fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3031fcdce8c4SStefano Zampini   PetscBool                     flg;
3032fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3033fcdce8c4SStefano Zampini   MatProductType                ptype;
3034fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3035fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3036fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3037fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3038fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3039fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3040fcdce8c4SStefano Zampini #else
3041fcdce8c4SStefano Zampini   int cnz;
3042fcdce8c4SStefano Zampini #endif
3043b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3044fcdce8c4SStefano Zampini 
3045fcdce8c4SStefano Zampini   PetscFunctionBegin;
3046fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
304728b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3048fcdce8c4SStefano Zampini   A = product->A;
3049fcdce8c4SStefano Zampini   B = product->B;
30509566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
305128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30529566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
305328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3054fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3055fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3056fcdce8c4SStefano Zampini   /* product data */
30579566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3058fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3059fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3060fcdce8c4SStefano Zampini 
30619566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3063d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3064d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
306508401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
306608401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3067d60bce21SJunchao Zhang 
3068fcdce8c4SStefano Zampini   ptype = product->type;
3069b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3070fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3071fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3072fa046f9fSJunchao Zhang   }
3073b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3074fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3075fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3076fa046f9fSJunchao Zhang   }
3077fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3078fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3079fcdce8c4SStefano Zampini   switch (ptype) {
3080fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3081fcdce8c4SStefano Zampini     m    = A->rmap->n;
3082fcdce8c4SStefano Zampini     n    = B->cmap->n;
3083fcdce8c4SStefano Zampini     k    = A->cmap->n;
3084fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3085fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3086fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3087fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3088fcdce8c4SStefano Zampini     break;
3089fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3090fcdce8c4SStefano Zampini     m = A->cmap->n;
3091fcdce8c4SStefano Zampini     n = B->cmap->n;
3092fcdce8c4SStefano Zampini     k = A->rmap->n;
30939566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3094fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3095fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3096fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3097fcdce8c4SStefano Zampini     break;
3098fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3099fcdce8c4SStefano Zampini     m = A->rmap->n;
3100fcdce8c4SStefano Zampini     n = B->rmap->n;
3101fcdce8c4SStefano Zampini     k = A->cmap->n;
31029566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3103fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3104fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3105fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3106fcdce8c4SStefano Zampini     break;
3107d71ae5a4SJacob Faibussowitsch   default:
3108d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3109fcdce8c4SStefano Zampini   }
3110fcdce8c4SStefano Zampini 
3111fcdce8c4SStefano Zampini   /* create cusparse matrix */
31129566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
31139566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3114fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3115fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3116fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3117fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3118fcdce8c4SStefano Zampini 
3119fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3120fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3121fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
31229566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
31239566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3124fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3125fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3126fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3127fcdce8c4SStefano Zampini   } else {
3128fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3129fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3130fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3131fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3132fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3133fcdce8c4SStefano Zampini   }
3134fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3135fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3136fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3137fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3138fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3139fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31409566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31419566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31429566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3143f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3144f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3145f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31469566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31479566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31489566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3149fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3150d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3151fcdce8c4SStefano Zampini     c->nz                = 0;
3152fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3153fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3154fcdce8c4SStefano Zampini     goto finalizesym;
3155fcdce8c4SStefano Zampini   }
3156fcdce8c4SStefano Zampini 
315728b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
315828b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3159fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3160fcdce8c4SStefano Zampini   if (!biscompressed) {
3161fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3162fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3163fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3164fcdce8c4SStefano Zampini #endif
3165fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3166fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3167fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3168fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3169fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3170fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3171fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3172fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3173fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3174fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3175fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31769566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3177fcdce8c4SStefano Zampini     }
3178fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3179fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3180fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3181fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
31829371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31839371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3184fcdce8c4SStefano Zampini     }
3185fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3186fcdce8c4SStefano Zampini #endif
3187fcdce8c4SStefano Zampini   }
318828b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
318928b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3190fcdce8c4SStefano Zampini   /* precompute flops count */
3191fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3192fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3193fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3194fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3195fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3196fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3197fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3198fcdce8c4SStefano Zampini       }
3199fcdce8c4SStefano Zampini     }
3200fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3201fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3202fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3203fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3204fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3205fcdce8c4SStefano Zampini     }
3206fcdce8c4SStefano Zampini   } else { /* TODO */
3207fcdce8c4SStefano Zampini     flops = 0.;
3208fcdce8c4SStefano Zampini   }
3209fcdce8c4SStefano Zampini 
3210fcdce8c4SStefano Zampini   mmdata->flops = flops;
32119566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3212b4285af6SJunchao Zhang 
3213fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
32149566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
32151ffab3bdSJunchao Zhang   // cuda-12.2 requires non-null csrRowOffsets
32161ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
32179371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32189566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3219b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3220b4285af6SJunchao Zhang   {
3221b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3222b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3223b4285af6SJunchao Zhang   */
3224b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3225b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3226b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3227b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3228b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3229b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3230b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3231b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3232b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3233b4285af6SJunchao Zhang 
3234b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32359371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32369371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32379566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3238b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32399371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32409371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3241b4285af6SJunchao Zhang 
32429371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32439371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32479371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32489371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32499566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3251b4285af6SJunchao Zhang 
3252b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32539566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3254b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3255b4285af6SJunchao Zhang     /* allocate matrix C */
32569371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32579371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32589371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32599371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3260b4285af6SJunchao Zhang     /* update matC with the new pointers */
32619371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32629371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3263b4285af6SJunchao Zhang 
32649371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32659371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32669566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32679371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32689371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32699566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32709371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32719371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32729566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3273b4285af6SJunchao Zhang   }
3274ae37ee31SJunchao Zhang   #else
3275b4285af6SJunchao Zhang   size_t bufSize2;
3276fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
32779371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
32789371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32799566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3280fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
32819371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
32829371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3283fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
32849371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
32859371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3286fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3287fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3288fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3289fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3290fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
32919566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3292fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
32939371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
32949371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3295fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
32969566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3297fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
32989371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
32999371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3300fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33019566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3302fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33039566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
33049371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
33059371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
33069371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
33079371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3308ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3309fcdce8c4SStefano Zampini #else
33109566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
33119371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33129371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
33139371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3314fcdce8c4SStefano Zampini   c->nz                = cnz;
3315fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
33169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3317fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
33189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3319fcdce8c4SStefano Zampini 
33209566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3321fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3322fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3323fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
33249371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
33259371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
33269371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3327fcdce8c4SStefano Zampini #endif
33289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3330fcdce8c4SStefano Zampini finalizesym:
3331fcdce8c4SStefano Zampini   c->free_a = PETSC_TRUE;
33329f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
33339f0612e4SBarry Smith   PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
3334fcdce8c4SStefano Zampini   c->free_ij = PETSC_TRUE;
33357de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3336fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3337fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3338fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3339fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3340fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3341fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3344fcdce8c4SStefano Zampini   } else {
3345fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3346fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3349fcdce8c4SStefano Zampini   }
3350fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3351fcdce8c4SStefano Zampini     PetscInt r = 0;
3352fcdce8c4SStefano Zampini     c->i[0]    = 0;
3353fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3354fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3355fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3356fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3357fcdce8c4SStefano Zampini     }
3358fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3359fcdce8c4SStefano Zampini   }
33609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33619566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33629566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3363fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3364fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3365fcdce8c4SStefano Zampini   c->rmax          = 0;
3366fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3367fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3368fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3369fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3370fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3371fcdce8c4SStefano Zampini   }
33729566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
33739566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3374fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3375fcdce8c4SStefano Zampini 
3376fcdce8c4SStefano Zampini   C->nonzerostate++;
33779566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
33789566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3379fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3380fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3381fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3382fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3383fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3384abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3385fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3386fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3387fcdce8c4SStefano Zampini   }
3388fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
33893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3390fcdce8c4SStefano Zampini }
3391fcdce8c4SStefano Zampini 
3392fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3393fcdce8c4SStefano Zampini 
3394fcdce8c4SStefano Zampini /* handles sparse or dense B */
3395d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3396d71ae5a4SJacob Faibussowitsch {
3397fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3398fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3399fcdce8c4SStefano Zampini 
3400fcdce8c4SStefano Zampini   PetscFunctionBegin;
3401fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
34029566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
340348a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3404fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3405fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
340648a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3407fcdce8c4SStefano Zampini   }
340865e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
340965e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
341065e4b4d4SStefano Zampini     switch (product->type) {
341165e4b4d4SStefano Zampini     case MATPRODUCT_AB:
341265e4b4d4SStefano Zampini       if (product->api_user) {
3413d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
34149566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3415d0609cedSBarry Smith         PetscOptionsEnd();
341665e4b4d4SStefano Zampini       } else {
3417d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
34189566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3419d0609cedSBarry Smith         PetscOptionsEnd();
342065e4b4d4SStefano Zampini       }
342165e4b4d4SStefano Zampini       break;
342265e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
342365e4b4d4SStefano Zampini       if (product->api_user) {
3424d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
34259566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3426d0609cedSBarry Smith         PetscOptionsEnd();
342765e4b4d4SStefano Zampini       } else {
3428d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
34299566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3430d0609cedSBarry Smith         PetscOptionsEnd();
343165e4b4d4SStefano Zampini       }
343265e4b4d4SStefano Zampini       break;
343365e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
343465e4b4d4SStefano Zampini       if (product->api_user) {
3435d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
34369566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3437d0609cedSBarry Smith         PetscOptionsEnd();
343865e4b4d4SStefano Zampini       } else {
3439d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
34409566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3441d0609cedSBarry Smith         PetscOptionsEnd();
344265e4b4d4SStefano Zampini       }
344365e4b4d4SStefano Zampini       break;
344465e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
344565e4b4d4SStefano Zampini       if (product->api_user) {
3446d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
34479566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3448d0609cedSBarry Smith         PetscOptionsEnd();
344965e4b4d4SStefano Zampini       } else {
3450d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
34519566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3452d0609cedSBarry Smith         PetscOptionsEnd();
345365e4b4d4SStefano Zampini       }
345465e4b4d4SStefano Zampini       break;
345565e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
345665e4b4d4SStefano Zampini       if (product->api_user) {
3457d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
34589566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3459d0609cedSBarry Smith         PetscOptionsEnd();
346065e4b4d4SStefano Zampini       } else {
3461d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
34629566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3463d0609cedSBarry Smith         PetscOptionsEnd();
346465e4b4d4SStefano Zampini       }
346565e4b4d4SStefano Zampini       break;
3466d71ae5a4SJacob Faibussowitsch     default:
3467d71ae5a4SJacob Faibussowitsch       break;
346865e4b4d4SStefano Zampini     }
346965e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
347065e4b4d4SStefano Zampini   }
347165e4b4d4SStefano Zampini   /* dispatch */
3472fcdce8c4SStefano Zampini   if (isdense) {
3473ccdfe979SStefano Zampini     switch (product->type) {
3474ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3475ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3476ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3477ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3478ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3479fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
34809566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3481fcdce8c4SStefano Zampini       } else {
3482fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3483fcdce8c4SStefano Zampini       }
3484fcdce8c4SStefano Zampini       break;
3485d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3486d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3487d71ae5a4SJacob Faibussowitsch       break;
3488d71ae5a4SJacob Faibussowitsch     default:
3489d71ae5a4SJacob Faibussowitsch       break;
3490ccdfe979SStefano Zampini     }
3491fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3492fcdce8c4SStefano Zampini     switch (product->type) {
3493fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3494fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3495d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3496d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3497d71ae5a4SJacob Faibussowitsch       break;
3498fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3499fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3500d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3501d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3502d71ae5a4SJacob Faibussowitsch       break;
3503d71ae5a4SJacob Faibussowitsch     default:
3504d71ae5a4SJacob Faibussowitsch       break;
3505fcdce8c4SStefano Zampini     }
3506fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
35079566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3508fcdce8c4SStefano Zampini   }
35093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3510ccdfe979SStefano Zampini }
3511ccdfe979SStefano Zampini 
3512d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3513d71ae5a4SJacob Faibussowitsch {
35149ae82921SPaul Mullowney   PetscFunctionBegin;
35159566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
35163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3517e6e9a74fSStefano Zampini }
3518e6e9a74fSStefano Zampini 
3519d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3520d71ae5a4SJacob Faibussowitsch {
3521e6e9a74fSStefano Zampini   PetscFunctionBegin;
35229566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
35233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3524e6e9a74fSStefano Zampini }
3525e6e9a74fSStefano Zampini 
3526d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3527d71ae5a4SJacob Faibussowitsch {
3528e6e9a74fSStefano Zampini   PetscFunctionBegin;
35299566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
35303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3531e6e9a74fSStefano Zampini }
3532e6e9a74fSStefano Zampini 
3533d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3534d71ae5a4SJacob Faibussowitsch {
3535e6e9a74fSStefano Zampini   PetscFunctionBegin;
35369566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
35373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
35389ae82921SPaul Mullowney }
35399ae82921SPaul Mullowney 
3540d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3541d71ae5a4SJacob Faibussowitsch {
3542ca45077fSPaul Mullowney   PetscFunctionBegin;
35439566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
35443ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3545ca45077fSPaul Mullowney }
3546ca45077fSPaul Mullowney 
3547d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3548d71ae5a4SJacob Faibussowitsch {
3549a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3550a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3551a0e72f99SJunchao Zhang }
3552a0e72f99SJunchao Zhang 
3553afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3554d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3555d71ae5a4SJacob Faibussowitsch {
35569ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3557aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
35589ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3559e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3560e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3561e6e9a74fSStefano Zampini   PetscBool                     compressed;
3562afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3563afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3564afb2bd1cSJunchao Zhang #endif
35656e111a19SKarl Rupp 
35669ae82921SPaul Mullowney   PetscFunctionBegin;
356708401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3568cbc6b225SStefano Zampini   if (!a->nz) {
3569995bce04SJacob Faibussowitsch     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3570995bce04SJacob Faibussowitsch     else PetscCall(VecSeq_CUDA::Set(zz, 0));
35713ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
3572e6e9a74fSStefano Zampini   }
357334d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
35749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3575e6e9a74fSStefano Zampini   if (!trans) {
35769ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
35775f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3578e6e9a74fSStefano Zampini   } else {
35791a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3580e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3581e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3582e6e9a74fSStefano Zampini     } else {
35839566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3584e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3585e6e9a74fSStefano Zampini     }
3586e6e9a74fSStefano Zampini   }
3587e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3588e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3589213423ffSJunchao Zhang 
3590e6e9a74fSStefano Zampini   try {
35919566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
359269d47153SPierre Jolivet     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
35939566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3594afb2bd1cSJunchao Zhang 
35959566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3596e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3597afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3598afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3599afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3600afb2bd1cSJunchao Zhang       */
3601e6e9a74fSStefano Zampini       xptr = xarray;
3602afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3603213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3604afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3605afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3606afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3607afb2bd1cSJunchao Zhang        */
3608afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3609afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3610fe5544b9SJunchao Zhang         nx             = mat->num_cols; // since y = Ax
3611afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3612afb2bd1cSJunchao Zhang       }
3613afb2bd1cSJunchao Zhang #endif
3614e6e9a74fSStefano Zampini     } else {
3615afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3616afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3617afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3618afb2bd1cSJunchao Zhang        */
3619afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3620e6e9a74fSStefano Zampini       dptr = zarray;
3621e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3622afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3623e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3624d0967f54SJacob Faibussowitsch 
3625d0967f54SJacob Faibussowitsch         thrust::for_each(
3626d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3627d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3628d0967f54SJacob Faibussowitsch #endif
3629d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
36309371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3631e6e9a74fSStefano Zampini       }
3632afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3633afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3634afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3635fe5544b9SJunchao Zhang         nx             = mat->num_rows; // since y = A^T x
3636afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3637afb2bd1cSJunchao Zhang       }
3638afb2bd1cSJunchao Zhang #endif
3639e6e9a74fSStefano Zampini     }
36409ae82921SPaul Mullowney 
3641afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3642aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3643afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3644fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3645fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA.
3646fe5544b9SJunchao Zhang   #else
3647fe5544b9SJunchao Zhang       cusparseSpMatDescr_t &matDescr = matstruct->matDescr;
3648fe5544b9SJunchao Zhang   #endif
3649fe5544b9SJunchao Zhang 
36505f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3651fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
3652fe5544b9SJunchao Zhang       if (!matDescr) {
3653fe5544b9SJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3654fe5544b9SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
3655fe5544b9SJunchao Zhang       }
3656fe5544b9SJunchao Zhang   #endif
3657fe5544b9SJunchao Zhang 
3658afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
36599566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
36609566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
36619371c9d4SSatish Balay         PetscCallCUSPARSE(
3662fe5544b9SJunchao Zhang           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
36639566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3664fe5544b9SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4
3665fe5544b9SJunchao Zhang         PetscCallCUSPARSE(
3666fe5544b9SJunchao Zhang           cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3667fe5544b9SJunchao Zhang   #endif
3668afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3669afb2bd1cSJunchao Zhang       } else {
3670afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
36719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
36729566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3673afb2bd1cSJunchao Zhang       }
3674afb2bd1cSJunchao Zhang 
3675fe5544b9SJunchao Zhang       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3676afb2bd1cSJunchao Zhang #else
36777656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
36789371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3679afb2bd1cSJunchao Zhang #endif
3680aa372e3fSPaul Mullowney     } else {
3681213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3682afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3683afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3684afb2bd1cSJunchao Zhang #else
3685301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
36869371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3687afb2bd1cSJunchao Zhang #endif
3688a65300a6SPaul Mullowney       }
3689aa372e3fSPaul Mullowney     }
36909566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3691aa372e3fSPaul Mullowney 
3692e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3693213423ffSJunchao Zhang       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3694213423ffSJunchao Zhang         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3695995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3696e6e9a74fSStefano Zampini         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3697995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
36987656d835SStefano Zampini         }
3699213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3700995bce04SJacob Faibussowitsch         PetscCall(VecSeq_CUDA::Set(zz, 0));
37017656d835SStefano Zampini       }
37027656d835SStefano Zampini 
3703213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3704213423ffSJunchao Zhang       if (compressed) {
37059566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3706da81f932SPierre Jolivet         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3707a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3708a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3709a0e72f99SJunchao Zhang          */
3710a0e72f99SJunchao Zhang #if 0
3711a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3712a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3713a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3714e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3715c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3716a0e72f99SJunchao Zhang #else
37176497c311SBarry Smith         PetscInt n = (PetscInt)matstruct->cprowIndices->size();
37186497c311SBarry Smith         ScatterAdd<<<(int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3719a0e72f99SJunchao Zhang #endif
37209566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3721e6e9a74fSStefano Zampini       }
3722e6e9a74fSStefano Zampini     } else {
3723995bce04SJacob Faibussowitsch       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3724e6e9a74fSStefano Zampini     }
37259566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
37269566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
37279566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3728d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3729d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3730d71ae5a4SJacob Faibussowitsch   }
3731e6e9a74fSStefano Zampini   if (yy) {
37329566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3733e6e9a74fSStefano Zampini   } else {
37349566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3735e6e9a74fSStefano Zampini   }
37363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37379ae82921SPaul Mullowney }
37389ae82921SPaul Mullowney 
3739d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3740d71ae5a4SJacob Faibussowitsch {
3741ca45077fSPaul Mullowney   PetscFunctionBegin;
37429566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
37433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3744ca45077fSPaul Mullowney }
3745ca45077fSPaul Mullowney 
3746d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3747d71ae5a4SJacob Faibussowitsch {
3748042217e8SBarry Smith   PetscFunctionBegin;
37499566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
37503ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37519ae82921SPaul Mullowney }
37529ae82921SPaul Mullowney 
3753e057df02SPaul Mullowney /*@
375411a5261eSBarry Smith   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
37552920cce0SJacob Faibussowitsch   (the default parallel PETSc format).
37569ae82921SPaul Mullowney 
3757d083f849SBarry Smith   Collective
37589ae82921SPaul Mullowney 
37599ae82921SPaul Mullowney   Input Parameters:
376011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF`
37619ae82921SPaul Mullowney . m    - number of rows
37629ae82921SPaul Mullowney . n    - number of columns
376320f4b53cSBarry Smith . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
376420f4b53cSBarry Smith - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37659ae82921SPaul Mullowney 
37669ae82921SPaul Mullowney   Output Parameter:
37679ae82921SPaul Mullowney . A - the matrix
37689ae82921SPaul Mullowney 
37692ef1f0ffSBarry Smith   Level: intermediate
37702ef1f0ffSBarry Smith 
37712ef1f0ffSBarry Smith   Notes:
37722920cce0SJacob Faibussowitsch   This matrix will ultimately pushed down to NVIDIA GPUs and use the CuSPARSE library for
37732920cce0SJacob Faibussowitsch   calculations. For good matrix assembly performance the user should preallocate the matrix
37742920cce0SJacob Faibussowitsch   storage by setting the parameter `nz` (or the array `nnz`).
37752920cce0SJacob Faibussowitsch 
377611a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
37779ae82921SPaul Mullowney   MatXXXXSetPreallocation() paradgm instead of this routine directly.
377811a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
37799ae82921SPaul Mullowney 
378011a5261eSBarry Smith   The AIJ format, also called
37812ef1f0ffSBarry Smith   compressed row storage, is fully compatible with standard Fortran
37829ae82921SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
378320f4b53cSBarry Smith   either one (as in Fortran) or zero.
37849ae82921SPaul Mullowney 
37859ae82921SPaul Mullowney   Specify the preallocated storage with either nz or nnz (not both).
37862ef1f0ffSBarry Smith   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
378720f4b53cSBarry Smith   allocation.
37889ae82921SPaul Mullowney 
3789fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
37909ae82921SPaul Mullowney @*/
3791d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3792d71ae5a4SJacob Faibussowitsch {
37939ae82921SPaul Mullowney   PetscFunctionBegin;
37949566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
37959566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
37969566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
37979566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
37983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37999ae82921SPaul Mullowney }
38009ae82921SPaul Mullowney 
3801d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3802d71ae5a4SJacob Faibussowitsch {
38039ae82921SPaul Mullowney   PetscFunctionBegin;
38049ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
38052c4ab24aSJunchao Zhang     PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
38069ae82921SPaul Mullowney   } else {
38079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3808aa372e3fSPaul Mullowney   }
38099566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
38109566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
38119566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
38129566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
38139566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
38149566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
38159566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
38169566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
38179566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
38189566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
38199566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
38203ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38219ae82921SPaul Mullowney }
38229ae82921SPaul Mullowney 
3823ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
382495639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3825d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3826d71ae5a4SJacob Faibussowitsch {
38279ff858a8SKarl Rupp   PetscFunctionBegin;
38289566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
38299566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
38303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
38319ff858a8SKarl Rupp }
38329ff858a8SKarl Rupp 
3833d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3834d71ae5a4SJacob Faibussowitsch {
3835a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3836039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3837039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3838039c6fbaSStefano Zampini   PetscScalar        *ay;
3839039c6fbaSStefano Zampini   const PetscScalar  *ax;
3840039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3841e6e9a74fSStefano Zampini 
384295639643SRichard Tran Mills   PetscFunctionBegin;
3843a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3844a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3845039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
38469566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38479566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
38483ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
384995639643SRichard Tran Mills   }
3850039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
38519566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
38529566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
38535f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
38545f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3855039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3856039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3857039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3858039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3859039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3860ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3861039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3862039c6fbaSStefano Zampini   }
3863d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3864d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3865039c6fbaSStefano Zampini 
3866039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3867039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3868039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3869039c6fbaSStefano Zampini     size_t bufferSize;
3870039c6fbaSStefano Zampini     void  *buffer;
3871039c6fbaSStefano Zampini #endif
3872039c6fbaSStefano Zampini 
38739566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
38749566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38759566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3876039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
38779371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38789371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
38799566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
38809566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38819371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38829371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
38839566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
38849566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
38859566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3886039c6fbaSStefano Zampini #else
38879566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38889371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38899371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
38909566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
38919566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3892039c6fbaSStefano Zampini #endif
38939566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
38949566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
38959566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
38969566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3897039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3898a587d139SMark     cublasHandle_t cublasv2handle;
3899a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3900039c6fbaSStefano Zampini 
39019566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
39029566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
39039566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
39049566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
39059566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
39069566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
39079566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
39089566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
39099566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
39109566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
39119566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3912039c6fbaSStefano Zampini   } else {
39139566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
39149566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3915a587d139SMark   }
39163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
391795639643SRichard Tran Mills }
391895639643SRichard Tran Mills 
/* Scale every stored value of Y on the GPU: Y(i,j) <- a * Y(i,j).
   Implemented as a single cuBLAS scal over the device array of nonzeros;
   the cached diagonal is invalidated since the values changed. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  /* Mutable device array of nonzeros; restoring it marks the GPU copy as current */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
393833c9ba73SStefano Zampini 
/* Zero every stored entry of A. For non-factored matrices the cached device CSR
   (and its transpose, if one has been built) is zeroed with thrust::fill; the host
   array is always zeroed. The offload mask becomes BOTH only when the primary device
   values were zeroed as well, otherwise the CPU copy is the sole valid one. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   gpu_zeroed = PETSC_FALSE;
  Mat_SeqAIJ *aij        = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        gpu_zeroed = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* aij->i[nrows] is the total number of stored entries in the host CSR */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpu_zeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
39653fa6b06aSMark Adams 
/*
  Bind matrix A to the CPU (flg == PETSC_TRUE) or release it to the GPU.

  For factored matrices only the flag is recorded; they have no separate op tables
  here. Otherwise the Mat and Mat_SeqAIJ operation tables are switched between the
  plain SeqAIJ (CPU) implementations and the CUSPARSE (GPU) ones, and the composed
  function hooks (COO assembly, MatProduct with dense/cusparse, subarray copy) are
  installed or removed so they stay consistent with the active backend.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices: just remember the binding */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host copy of the values is current before switching to CPU ops */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the Mat_SeqAIJ sub-ops (get/restore array hooks) so the defaults apply */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* route value-array access through the CUSPARSE back end so host/device copies stay coherent */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* the i-node variants are only meaningful when running on the CPU */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4029a587d139SMark 
/*
  Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.

  Duplicates or copies A into *newmat as requested by reuse (MAT_INPLACE_MATRIX
  reuses A itself), switches the default vector type to VECCUDA, allocates the
  GPU-side context (Mat_SeqAIJCUSPARSE, or the tri-factor variant for factored
  matrices) with its cuSPARSE handle/stream, installs the CUSPARSE op table via
  MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE), and composes the type-specific hooks.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU too */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry the triangular-factor context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* start unbound from the CPU, i.e. with the GPU op table active */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
40899ae82921SPaul Mullowney 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix, then
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
409702fe1965SBarry Smith 
40983ca39a21SBarry Smith /*MC
4099e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4100e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4104e057df02SPaul Mullowney 
4105e057df02SPaul Mullowney    Options Database Keys:
410611a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
41072ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
41082ef1f0ffSBarry Smith                                       Other options include ell (ellpack) or hyb (hybrid).
41092ef1f0ffSBarry Smith .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
41102ef1f0ffSBarry Smith -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4111e057df02SPaul Mullowney 
4112e057df02SPaul Mullowney   Level: beginner
4113e057df02SPaul Mullowney 
41141cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4115e057df02SPaul Mullowney M*/
41167f756511SDominic Meiser 
/* Register the cuSPARSE solver package for every factorization it provides on
   MATSEQAIJCUSPARSE matrices: LU, Cholesky, ILU, and ICC, all served by the same
   factory routine. */
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType factors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  for (size_t i = 0; i < sizeof(factors) / sizeof(factors[0]); i++) PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, factors[i], MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
412629b38603SBarry Smith 
/* Free the GPU-side context attached to a MATSEQAIJCUSPARSE matrix: the cached
   device matrix and its transpose, the helper vectors/index arrays, and the
   cuSPARSE handle; finally release mat->spptr itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->csr2csc_i;
  delete cusp->coords;
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  PetscCall(PetscFree(mat->spptr));
  PetscFunctionReturn(PETSC_SUCCESS);
}
41447f756511SDominic Meiser 
/* Delete a CsrMatrix and its three thrust arrays (values, column indices, row
   offsets); *mat is reset to NULL. Safe to call when *mat is already NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = nullptr;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
41577f756511SDominic Meiser 
4158b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* Free one triangular-factor structure (pre-CUDA 11.4 path): its cuSPARSE
   descriptors, the CSR storage, and the device/host work buffers, then the
   struct itself (PetscFree also nulls *trifactor). */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
    if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
    PetscCall(CsrMatrix_Destroy(&tf->csrMat));
    if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
    if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4175d460d7bfSJunchao Zhang #endif
41767f756511SDominic Meiser 
/* Free a multiply structure: the stored CSR (or HYB, pre-CUDA 11) matrix, the
   cuSPARSE matrix descriptor, the device-allocated scalar constants, and any
   initialized SpMV descriptors/buffers. *matstruct is set to NULL on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSEMultStruct *ms = *matstruct;

  PetscFunctionBegin;
  if (ms) {
    if (ms->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)ms->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        CsrMatrix *csr = (CsrMatrix *)ms->mat;
        PetscCall(CsrMatrix_Destroy(&csr));
      }
    }
    if (ms->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(ms->descr));
    delete ms->cprowIndices;
    /* scalar constants kept in device memory (freed with cudaFree) */
    if (ms->alpha_one) PetscCallCUDA(cudaFree(ms->alpha_one));
    if (ms->beta_zero) PetscCallCUDA(cudaFree(ms->beta_zero));
    if (ms->beta_one) PetscCallCUDA(cudaFree(ms->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if (ms->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr));

    for (int i = 0; i < 3; i++) {
      if (ms->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(ms->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(ms->cuSpMV[i].vecYDescr));
  #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0)
        if (ms->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMV[i]));
        if (ms->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(ms->matDescr_SpMM[i]));
  #endif
      }
    }
#endif
    delete ms;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
42237f756511SDominic Meiser 
/*
  Release every resource held by a tri-factor context without destroying the
  context (or its cuSPARSE handle) itself, so it can be refilled by a new
  factorization. The pre-11.4 path frees the legacy per-factor structures;
  the 11.4+ path frees the SpSV-based device arrays, descriptors, and host
  shadow arrays.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    /* legacy csrsv path: four factor structures (lower/upper and their transposes) */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    /* device-side factor storage and work vectors */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    /* cuSPARSE descriptors for the factored matrix and its triangular solves */
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    /* host-side shadow arrays */
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4277ccdfe979SStefano Zampini 
/* Fully destroy a tri-factor context: reset (free) its contents, destroy the
   cuSPARSE handle, and release the context struct. No-op when *trifactors is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(PETSC_SUCCESS);
}
42887e8381f9SStefano Zampini 
/* Lexicographic (row, column) "less-than" for thrust sorts over (i, j) tuples:
   order by row first, then by column within equal rows. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &a, const thrust::tuple<PetscInt, PetscInt> &b)
  {
    const PetscInt ra = thrust::get<0>(a), rb = thrust::get<0>(b);
    if (ra != rb) return ra < rb;
    return thrust::get<1>(a) < thrust::get<1>(b);
  }
};
42977e8381f9SStefano Zampini 
/* Mark the cached device transpose of A as stale; when destroy is true, also
   free the transpose multiply structure and the cached csr2csc index array so
   they are rebuilt from scratch next time. No-op when A has no GPU context. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!dev) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&dev->matTranspose, dev->format));
    delete dev->csr2csc_i;
    dev->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4313a49f1ed0SStefano Zampini 
/* Container destructor for the COO assembly data: free the device-resident perm
   and jmap arrays (cudaFree), then the host struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = static_cast<MatCOOStruct_SeqAIJ *>(*data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4324ed502f03SStefano Zampini 
/* Preallocate the matrix from COO (i,j) index lists that may live on host or device.
   The host-side MatSetPreallocationCOO_SeqAIJ() does the analysis; afterwards the
   resulting COO assembly data (jmap, perm) is mirrored on the device and attached
   to the matrix for use by MatSetValuesCOO_SeqAIJCUSPARSE(). */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            ij_on_device = PETSC_FALSE;
  PetscMemType         mtype        = PETSC_MEMTYPE_HOST;
  PetscInt            *i_h, *j_h;
  PetscContainer       container_h;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The host-side preallocation needs coo_i/coo_j on the host; stage copies there if the caller gave device pointers
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    ij_on_device = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i_h, coo_n, &j_h));
    PetscCallCUDA(cudaMemcpy(i_h, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j_h, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i_h = coo_i;
    j_h = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i_h, j_h));
  if (ij_on_device) PetscCall(PetscFree2(i_h, j_h));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* the CPU copy built above is authoritative */
  // Allocate/refresh the GPU matrix now that the nonzero structure is known
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Mirror the host COO struct on the device: shallow-copy the scalar fields, deep-copy jmap[] and perm[]
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h;
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Attach the device struct to the matrix so the value-insertion path can find it
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4366219fbbafSJunchao Zhang 
/* Device kernel: accumulate COO input values kv[] into the CSR value array a[].
   For CSR entry i, the range jmap[i]..jmap[i+1] delimits its slots in perm[], and perm[]
   maps each slot back into kv[]. INSERT_VALUES discards the prior a[i]; otherwise it adds.
   A grid-stride loop makes any 1D launch configuration correct. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    if (imode == INSERT_VALUES) a[i] = sum;
    else a[i] += sum;
  }
}
4377219fbbafSJunchao Zhang 
/* Insert/add COO values v[] (host or device pointer) into the device CSR matrix, using the
   jmap/perm assembly data prepared by MatSetPreallocationCOO_SeqAIJCUSPARSE(). Host-resident
   v[] is staged to the device first. */
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only: no device update needed */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    // Compute the block count in 64-bit and clamp to CUDA's maximum 1D grid dimension (2^31-1).
    // The previous (int)(Annz + 255) cast could truncate/overflow for matrices with more than
    // ~2^31 nonzeros; the grid-stride loop inside MatAddCOOValues keeps a clamped grid correct.
    const PetscCount nblocks = PetscMin((Annz + 255) / 256, (PetscCount)0x7FFFFFFF);

    MatAddCOOValues<<<(unsigned int)nblocks, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing sticky state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the staging copy */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4417219fbbafSJunchao Zhang 
44185b7e41feSStefano Zampini /*@C
44192ef1f0ffSBarry Smith   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
44205b7e41feSStefano Zampini 
44212ef1f0ffSBarry Smith   Not Collective
44225b7e41feSStefano Zampini 
44235b7e41feSStefano Zampini   Input Parameters:
44245b7e41feSStefano Zampini + A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
44265b7e41feSStefano Zampini 
44275b7e41feSStefano Zampini   Output Parameters:
442820f4b53cSBarry Smith + i - the CSR row pointers
442920f4b53cSBarry Smith - j - the CSR column indices
44305b7e41feSStefano Zampini 
44315b7e41feSStefano Zampini   Level: developer
44325b7e41feSStefano Zampini 
443311a5261eSBarry Smith   Note:
44345b7e41feSStefano Zampini   When compressed is true, the CSR structure does not contain empty rows
44355b7e41feSStefano Zampini 
44361cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
44375b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *aij            = (Mat_SeqAIJ *)A->data;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* caller asked for neither/only one index array: nothing to do */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR is current */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  if (i) {
    if (!compressed && aij->compressedrow.use) { /* stored CSR is compressed: build (and cache) full row offsets on the device */
      if (!cusparsestruct->rowoffsets_gpu) {
        cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusparsestruct->rowoffsets_gpu->assign(aij->i, aij->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusparsestruct->rowoffsets_gpu->data().get();
    } else *i = matrix->row_offsets->data().get();
  }
  if (j) *j = matrix->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
44655f101d05SStefano Zampini 
44665b7e41feSStefano Zampini /*@C
44672ef1f0ffSBarry Smith   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
44685b7e41feSStefano Zampini 
44692ef1f0ffSBarry Smith   Not Collective
44705b7e41feSStefano Zampini 
44715b7e41feSStefano Zampini   Input Parameters:
44725b7e41feSStefano Zampini + A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
447420f4b53cSBarry Smith . i          - the CSR row pointers
447520f4b53cSBarry Smith - j          - the CSR column indices
44765b7e41feSStefano Zampini 
44775b7e41feSStefano Zampini   Level: developer
44785b7e41feSStefano Zampini 
44791cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
44805b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused; kept for symmetry with MatSeqAIJCUSPARSEGetIJ() */
  /* zero the caller's pointers so any stale use is caught quickly */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
44915f101d05SStefano Zampini 
44925b7e41feSStefano Zampini /*@C
449311a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
44945b7e41feSStefano Zampini 
44955b7e41feSStefano Zampini   Not Collective
44965b7e41feSStefano Zampini 
44975b7e41feSStefano Zampini   Input Parameter:
449811a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
44995b7e41feSStefano Zampini 
45005b7e41feSStefano Zampini   Output Parameter:
45015b7e41feSStefano Zampini . a - pointer to the device data
45025b7e41feSStefano Zampini 
45035b7e41feSStefano Zampini   Level: developer
45045b7e41feSStefano Zampini 
450511a5261eSBarry Smith   Note:
450611a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
45075b7e41feSStefano Zampini 
45081cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
45095b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read access: device copy must be up to date */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get(); /* raw device pointer into the CSR value array */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4527ed502f03SStefano Zampini 
45285b7e41feSStefano Zampini /*@C
452911a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
45305b7e41feSStefano Zampini 
45315b7e41feSStefano Zampini   Not Collective
45325b7e41feSStefano Zampini 
45332ef1f0ffSBarry Smith   Input Parameters:
45342ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
45352ef1f0ffSBarry Smith - a - pointer to the device data
45365b7e41feSStefano Zampini 
45375b7e41feSStefano Zampini   Level: developer
45385b7e41feSStefano Zampini 
45391cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
45405b7e41feSStefano Zampini @*/
/* Read-only restore: no state bump or diagonal invalidation needed since values were not modified */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  *a = NULL; /* zero the caller's pointer so stale use is caught */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4550ed502f03SStefano Zampini 
45515b7e41feSStefano Zampini /*@C
455211a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45535b7e41feSStefano Zampini 
45545b7e41feSStefano Zampini   Not Collective
45555b7e41feSStefano Zampini 
45565b7e41feSStefano Zampini   Input Parameter:
455711a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45585b7e41feSStefano Zampini 
45595b7e41feSStefano Zampini   Output Parameter:
45605b7e41feSStefano Zampini . a - pointer to the device data
45615b7e41feSStefano Zampini 
45625b7e41feSStefano Zampini   Level: developer
45635b7e41feSStefano Zampini 
456411a5261eSBarry Smith   Note:
456511a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
45665b7e41feSStefano Zampini 
45671cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
45685b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: start from up-to-date device values */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: the device copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose no longer matches */
  PetscFunctionReturn(PETSC_SUCCESS);
}
45885b7e41feSStefano Zampini /*@C
458911a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4590039c6fbaSStefano Zampini 
45915b7e41feSStefano Zampini   Not Collective
45925b7e41feSStefano Zampini 
45932ef1f0ffSBarry Smith   Input Parameters:
45942ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
45952ef1f0ffSBarry Smith - a - pointer to the device data
45965b7e41feSStefano Zampini 
45975b7e41feSStefano Zampini   Level: developer
45985b7e41feSStefano Zampini 
45991cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
46005b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));             /* values may have changed; cached diagonal data is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));   /* bump object state so dependents see the modification */
  *a = NULL; /* zero the caller's pointer so stale use is caught */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4612039c6fbaSStefano Zampini 
46135b7e41feSStefano Zampini /*@C
461411a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
46155b7e41feSStefano Zampini 
46165b7e41feSStefano Zampini   Not Collective
46175b7e41feSStefano Zampini 
46185b7e41feSStefano Zampini   Input Parameter:
461911a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
46205b7e41feSStefano Zampini 
46215b7e41feSStefano Zampini   Output Parameter:
46225b7e41feSStefano Zampini . a - pointer to the device data
46235b7e41feSStefano Zampini 
46245b7e41feSStefano Zampini   Level: developer
46255b7e41feSStefano Zampini 
462611a5261eSBarry Smith   Note:
462711a5261eSBarry Smith   Does not trigger host-device copies and flags data validity on the GPU
46285b7e41feSStefano Zampini 
46291cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
46305b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no host-to-device copy is performed, the old values are irrelevant */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose no longer matches */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4649ed502f03SStefano Zampini 
46505b7e41feSStefano Zampini /*@C
465111a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
46525b7e41feSStefano Zampini 
46535b7e41feSStefano Zampini   Not Collective
46545b7e41feSStefano Zampini 
46552ef1f0ffSBarry Smith   Input Parameters:
46562ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46572ef1f0ffSBarry Smith - a - pointer to the device data
46585b7e41feSStefano Zampini 
46595b7e41feSStefano Zampini   Level: developer
46605b7e41feSStefano Zampini 
46611cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
46625b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));             /* values were (re)written; cached diagonal data is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));   /* bump object state so dependents see the modification */
  *a = NULL; /* zero the caller's pointer so stale use is caught */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4674ed502f03SStefano Zampini 
/* Order (row, col, value, flag) tuples by row then column; the value and flag members
   ride along and take no part in the comparison */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);

    if (r1 != r2) return r1 < r2; /* different rows decide the order outright */
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4683ed502f03SStefano Zampini 
/* Unary functor adding a fixed offset to an index; used below to shift column indices */
struct Shift {
  int _delta; /* amount added to every input index */

  Shift(int d) : _delta(d) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _delta; }
};
4690ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices A, B side by side into C = [A, B], i.e. each row of C is the corresponding row of A followed by that of B ([A';B']' in MATLAB notation) */
4692d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4693d71ae5a4SJacob Faibussowitsch {
4694ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4695ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4696ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4697ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4698ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4699ed502f03SStefano Zampini   cusparseStatus_t              stat;
4700ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4701ed502f03SStefano Zampini 
4702ed502f03SStefano Zampini   PetscFunctionBegin;
4703ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4704ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
47054f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4706ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4707ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47085f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
470908401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4710aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4711aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4712ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4713ed502f03SStefano Zampini     m = A->rmap->n;
4714ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47159566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47169566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47179566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4718ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4719ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4720ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4721ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4722ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4723ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4724ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4725ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4726ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4727ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4728ed502f03SStefano Zampini     Ccusp->nrows            = m;
4729ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4730ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4731ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4732ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47339566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47349566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47359566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4736f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4737f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4738f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
47399566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47419566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47429566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47439566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
474428b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
474528b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4746ed502f03SStefano Zampini 
4747ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4748ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4749ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4750ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4751ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4752ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4753ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4754ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4755ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
47562c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4757ed502f03SStefano Zampini     if (c->nz) {
47582ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
47592ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
47602ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
47612ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
47622ed87e7eSStefano Zampini 
4763ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4764ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4765ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4766ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
47679566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4768ed502f03SStefano Zampini         }
47692ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
47702ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4771ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4772ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4773ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4774ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
47759566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4776ed502f03SStefano Zampini         }
47772ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
47782ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
47799566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
47809371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47819371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47829371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47839371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47842ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
47852ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
47862ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
47878909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4788ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4789ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
47908909a122SStefano Zampini #else
47918909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
47928909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
47938909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
47948909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
47958909a122SStefano Zampini #endif
47962ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
47972ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
47982ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
47992ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48002ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48012ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
48022c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
48032c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
4804ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4805792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48068909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48078909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48088909a122SStefano Zampini #endif
48092ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48102ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48112ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4812792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48132ed87e7eSStefano Zampini #else
48142ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4815792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4816792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48172ed87e7eSStefano Zampini #endif
48189371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48199371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48209566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48212ed87e7eSStefano Zampini       delete wPerm;
48222ed87e7eSStefano Zampini       delete Acoo;
48232ed87e7eSStefano Zampini       delete Bcoo;
48242ed87e7eSStefano Zampini       delete Ccoo;
4825ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48269371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48279371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4828ed502f03SStefano Zampini #endif
48291a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
48309566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48319566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4832ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4833ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4834ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4835ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4836ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4837ed502f03SStefano Zampini 
48381a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48391a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4840a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4841ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4842ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4843ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4844ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4845ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4846ed502f03SStefano Zampini 
4847ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4848ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4849ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4850ed502f03SStefano Zampini 
48519566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4852ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4853ed502f03SStefano Zampini         if (AT) {
4854ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4855ed502f03SStefano Zampini           thrust::advance(rT, -1);
4856ed502f03SStefano Zampini         }
4857ed502f03SStefano Zampini         if (BT) {
4858ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4859ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4860ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4861ed502f03SStefano Zampini         }
4862ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4863ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4864ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4865ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4866ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4867ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48689566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4869ed502f03SStefano Zampini 
48709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
48719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
48729566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4873f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4874f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4875f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
48769566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48779566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48789566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4879ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48809371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48819371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4882ed502f03SStefano Zampini #endif
4883ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4884ed502f03SStefano Zampini       }
4885ed502f03SStefano Zampini     }
4886ed502f03SStefano Zampini 
4887ed502f03SStefano Zampini     c->free_a = PETSC_TRUE;
48889f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j));
48899f0612e4SBarry Smith     PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i));
4890ed502f03SStefano Zampini     c->free_ij = PETSC_TRUE;
48917de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4892ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4893ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4894ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4895ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
48969566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
48979566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4898ed502f03SStefano Zampini     } else {
48999566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49009566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4901ed502f03SStefano Zampini     }
49029566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49039566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49049566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4905ed502f03SStefano Zampini     c->maxnz         = c->nz;
4906ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4907ed502f03SStefano Zampini     c->rmax          = 0;
4908ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4909ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4910ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4911ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4912ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4913ed502f03SStefano Zampini     }
49149566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
49159566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4916ed502f03SStefano Zampini     (*C)->nonzerostate++;
49179566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49189566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4919ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4920ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4921ed502f03SStefano Zampini   } else {
492208401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4923ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4924ed502f03SStefano Zampini     if (c->nz) {
4925ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49262c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4927aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
492808401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49299566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49309566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49315f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49325f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4933ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4934ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4935ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4936aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4937aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4938aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4939aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49402c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
49412c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
4942ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
49439566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
49442c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
49459371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4946ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49479371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
49482c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4949ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
49509566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
49511a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
49525f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4953ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4954ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4955ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4956ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4957ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4958ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4959ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49601a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4961ed502f03SStefano Zampini       }
49629566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4963ed502f03SStefano Zampini     }
4964ed502f03SStefano Zampini   }
49659566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4966ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4967ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4968ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
49693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4970ed502f03SStefano Zampini }
4971c215019aSStefano Zampini 
4972d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4973d71ae5a4SJacob Faibussowitsch {
4974c215019aSStefano Zampini   bool               dmem;
4975c215019aSStefano Zampini   const PetscScalar *av;
4976c215019aSStefano Zampini 
4977c215019aSStefano Zampini   PetscFunctionBegin;
4978c215019aSStefano Zampini   dmem = isCudaMem(v);
49799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4980c215019aSStefano Zampini   if (n && idx) {
4981c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4982c215019aSStefano Zampini     widx.assign(idx, idx + n);
49839566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4984c215019aSStefano Zampini 
4985c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
4986c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
4987c215019aSStefano Zampini     if (dmem) {
4988c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
4989c215019aSStefano Zampini     } else {
4990c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
4991c215019aSStefano Zampini       dv = w->data();
4992c215019aSStefano Zampini     }
4993c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4994c215019aSStefano Zampini 
4995c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4996c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4997c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
499848a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4999c215019aSStefano Zampini     delete w;
5000c215019aSStefano Zampini   } else {
50019566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5002c215019aSStefano Zampini   }
50039566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
50049566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
50053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5006c215019aSStefano Zampini }
5007b0c00012SPierre Jolivet PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END()
5008