xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision b917901dc2036dfc8a5a86cdcebb806de4b13a00)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
19d0967f54SJacob Faibussowitsch #endif
20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
21a2cee5feSJed Brown #include <thrust/remove.h>
22a2cee5feSJed Brown #include <thrust/sort.h>
23a2cee5feSJed Brown #include <thrust/unique.h>
24e8d2b73aSMark Adams 
25e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29afb2bd1cSJunchao Zhang 
30afb2bd1cSJunchao Zhang   typedef enum {
31afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
33afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
35afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
36afb2bd1cSJunchao Zhang 
37afb2bd1cSJunchao Zhang   typedef enum {
38afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
50afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
51afb2bd1cSJunchao Zhang 
52afb2bd1cSJunchao Zhang   typedef enum {
5335cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
5435cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
56afb2bd1cSJunchao Zhang   */
57afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60afb2bd1cSJunchao Zhang #endif
619ae82921SPaul Mullowney 
62087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
656fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
66*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
69d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
72d460d7bfSJunchao Zhang #endif
73dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
74a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7533c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
766fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
786fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
839ae82921SPaul Mullowney 
847f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
887f756511SDominic Meiser 
8957181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9157181aedSStefano Zampini 
92c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
93e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
94219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95c215019aSStefano Zampini 
/*
  Type-specific implementation behind MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE.
  For sequential matrices MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both set the single
  stored format; any other operation is rejected.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall-through: a seq matrix has only one storage format, so both ops set it */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1139ae82921SPaul Mullowney 
114e057df02SPaul Mullowney /*@
11511a5261eSBarry Smith    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
11611a5261eSBarry Smith    operation. Only the `MatMult()` operation can use different GPU storage formats
11711a5261eSBarry Smith 
118e057df02SPaul Mullowney    Not Collective
119e057df02SPaul Mullowney 
120e057df02SPaul Mullowney    Input Parameters:
12111a5261eSBarry Smith +  A - Matrix of type `MATSEQAIJCUSPARSE`
1222ef1f0ffSBarry Smith .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
1232ef1f0ffSBarry Smith         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
12411a5261eSBarry Smith -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
125e057df02SPaul Mullowney 
126e057df02SPaul Mullowney    Level: intermediate
127e057df02SPaul Mullowney 
1282ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129e057df02SPaul Mullowney @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific MatCUSPARSESetFormat_C implementation if the matrix type
     composed one; PetscTryMethod() is a no-op for types that did not compose it */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
137e057df02SPaul Mullowney 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): record the user's
   CPU-vs-GPU solve preference in the matrix's cuSPARSE-side data structure */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
146365b711fSMark Adams 
147365b711fSMark Adams /*@
14811a5261eSBarry Smith    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
149365b711fSMark Adams 
150365b711fSMark Adams    Input Parameters:
15111a5261eSBarry Smith +  A - Matrix of type `MATSEQAIJCUSPARSE`
15211a5261eSBarry Smith -  use_cpu - set flag for using the built-in CPU `MatSolve()`
153365b711fSMark Adams 
1542ef1f0ffSBarry Smith    Level: intermediate
155365b711fSMark Adams 
15611a5261eSBarry Smith    Note:
157365b711fSMark Adams    The cuSparse LU solver currently computes the factors with the built-in CPU method
158365b711fSMark Adams    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or the GPU (GPU is the default).
160365b711fSMark Adams 
1612ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
162365b711fSMark Adams @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Forwards to MatCUSPARSESetUseCPUSolve_C when composed on A; no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
170365b711fSMark Adams 
/*
  MatSetOption implementation for MATSEQAIJCUSPARSE. Only MAT_FORM_EXPLICIT_TRANSPOSE is
  handled here; every other option is delegated to the SeqAIJ implementation.
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* Destroy any cached explicit transpose when the option is turned off, so a stale
       transpose cannot be picked up if the option is later re-enabled */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
186e6e9a74fSStefano Zampini 
/*
  Parse the -mat_cusparse_* command-line options controlling storage formats and cuSPARSE
  algorithm choices for a MATSEQAIJCUSPARSE matrix.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  /* format/algorithm options apply only to a regular (non-factored) matrix */
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
2229ae82921SPaul Mullowney 
223*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Build a combined L+U factored matrix M in regular CSR form on the device from the CPU-side
  SeqAIJ factor storage (L stored in a->i/a->j/a->a, U stored backwards via a->diag), and run
  the numeric cusparseSpSV_analysis() for both triangular solves.

  The CSR structure (row pointers, column indices, descriptors, SpSV buffers) is created once,
  on the first call; subsequent calls only re-copy the numerical values and redo the analysis.
  Only runs when the latest factors live on the CPU (offloadmask == PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mi, *Mj, Mnz;
  PetscScalar                  *Ma;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is it the first time to do the setup? Use csrRowPtr since it is not null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host
      Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal)
      PetscCall(PetscMalloc1(m + 1, &Mi));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp
      PetscCall(PetscMalloc1(Mnz, &Ma));
      Mi[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];
        PetscInt ulen = Adiag[i] - Adiag[i + 1];
        PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen));                           // entries of L
        Mj[Mi[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
        Mi[i + 1] = Mi[i] + llen + ulen;
      }
      // Copy M (L,U) from host to device; csrVal is allocated here but filled below in the value-copy pass
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*(fs->csrRowPtr)) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*(fs->csrColIdx)) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Both descriptors alias the same device CSR arrays; only fill mode / diag type differ
      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Record for reuse: keep host row pointers and host value buffer; Mj is no longer needed
      fs->csrRowPtr_h = Mi;
      fs->csrVal_h    = Ma;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value (runs on every call: structure is fixed, values change with each new numeric factorization)
    Mi  = fs->csrRowPtr_h;
    Ma  = fs->csrVal_h;
    Mnz = Mi[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen));                           // entries of L
      Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry
      PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
320d460d7bfSJunchao Zhang #else
321d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
322d71ae5a4SJacob Faibussowitsch {
3239ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3249ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3259ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
326aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3279ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3289ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3299ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3309ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3319ae82921SPaul Mullowney 
3329ae82921SPaul Mullowney   PetscFunctionBegin;
3333ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
334c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3359ae82921SPaul Mullowney     try {
3369ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3379ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
338da79fbbcSStefano Zampini       if (!loTriFactor) {
3392cbc15d9SMark         PetscScalar *AALo;
3402cbc15d9SMark 
3419566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3429ae82921SPaul Mullowney 
3439ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3449566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3459566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3469ae82921SPaul Mullowney 
3479ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3489ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3499ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3509ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3519ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3529ae82921SPaul Mullowney         v         = aa;
3539ae82921SPaul Mullowney         vi        = aj;
3549ae82921SPaul Mullowney         offset    = 1;
3559ae82921SPaul Mullowney         rowOffset = 1;
3569ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3579ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
358e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3599ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3609ae82921SPaul Mullowney           rowOffset += nz + 1;
3619ae82921SPaul Mullowney 
3629566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
3639566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
3649ae82921SPaul Mullowney 
3659ae82921SPaul Mullowney           offset += nz;
3669ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3679ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3689ae82921SPaul Mullowney           offset += 1;
3699ae82921SPaul Mullowney 
3709ae82921SPaul Mullowney           v += nz;
3719ae82921SPaul Mullowney           vi += nz;
3729ae82921SPaul Mullowney         }
3732205254eSKarl Rupp 
374aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3759566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
376da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
377aa372e3fSPaul Mullowney         /* Create the matrix description */
3789566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3801b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3819566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
382afb2bd1cSJunchao Zhang   #else
3839566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
384afb2bd1cSJunchao Zhang   #endif
3859566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
387aa372e3fSPaul Mullowney 
388aa372e3fSPaul Mullowney         /* set the operation */
389aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
390aa372e3fSPaul Mullowney 
391aa372e3fSPaul Mullowney         /* set the matrix */
392aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
393aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
394aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
395aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
396aa372e3fSPaul Mullowney 
397aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
398aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
399aa372e3fSPaul Mullowney 
400aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
401aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
402aa372e3fSPaul Mullowney 
403aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
404aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
405aa372e3fSPaul Mullowney 
406afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4079566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
408261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
4091b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4109371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4119371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
4129566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
413afb2bd1cSJunchao Zhang   #endif
414afb2bd1cSJunchao Zhang 
415aa372e3fSPaul Mullowney         /* perform the solve analysis */
4169371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4179f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4189566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4199566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
420aa372e3fSPaul Mullowney 
421da79fbbcSStefano Zampini         /* assign the pointer */
422aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4232cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4249566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4269566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
427da79fbbcSStefano Zampini       } else { /* update values only */
42848a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
429da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4302cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
431da79fbbcSStefano Zampini         v                    = aa;
432da79fbbcSStefano Zampini         vi                   = aj;
433da79fbbcSStefano Zampini         offset               = 1;
434da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
435da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
4369566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
437da79fbbcSStefano Zampini           offset += nz;
4382cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
439da79fbbcSStefano Zampini           offset += 1;
440da79fbbcSStefano Zampini           v += nz;
441da79fbbcSStefano Zampini         }
4422cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4439566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
444da79fbbcSStefano Zampini       }
445d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
446d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
447d71ae5a4SJacob Faibussowitsch     }
4489ae82921SPaul Mullowney   }
4493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4509ae82921SPaul Mullowney }
4519ae82921SPaul Mullowney 
/*
  Builds (or numerically refreshes) the upper-triangular ILU factor U on the GPU for the
  legacy csrsv code path (used when the newer MatSeqAIJCUSPARSEBuildFactoredMatrix_LU()
  path is not compiled in; see the caller MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU()).

  First call: allocates the CSR arrays, creates the cusparse matrix descriptor and runs the
  triangular-solve analysis. Subsequent calls (same sparsity pattern): only re-upload values.

  Layout note: in the host factored SeqAIJ matrix, U is addressed through a->diag with
  adiag[i] > adiag[i+1] (rows stored back to front), hence nzUpper = adiag[0] - adiag[n]
  and the fill loops below walk rows from n-1 down to 0 while decrementing `offset`.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* (re)build only when the freshest factor values live on the host */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* number of nonzeros in the upper triangular factor (diag offsets decrease; see layout note) */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first call: build structure and values */
        PetscScalar *AAUp;

        /* pinned host staging buffer for the values, so device uploads are fast */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate space (pinned) for the upper triangular matrix structure */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix in plain front-to-back CSR order */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* v/vi point at row i's strictly-upper entries; v[nz] (== aa[adiag[i]]) is the stored pivot */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset to the start of row i (diagonal first, then nz off-diagonals) */
          offset -= (nz + 1);

          /* diagonal entry: 1./v[nz] because the host factor keeps the pivot inverted
             (NOTE(review): cf. the "inverse of the diagonal" comment in the Cholesky path — confirm) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          /* then copy the strictly-upper entries of the row */
          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: upload the staged CSR arrays to device-side thrust vectors */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* publish the factor; keep the pinned AAUp buffer for later value refreshes,
           while the structural staging arrays are no longer needed */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* pattern unchanged: refresh numerical values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Re-stage the upper triangular values (same back-to-front walk as above) */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* diagonal entry first (inverted pivot, as in the build branch), then the row */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
582d460d7bfSJunchao Zhang #endif
5839ae82921SPaul Mullowney 
// Ensures the ILU triangular factors of A are present and current on the GPU and caches
// the row/column permutations (when not the identity) as device index arrays for MatSolve.
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowperm = aij->row, colperm = aij->icol;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  // Legacy path: build L and U separately and keep a scratch vector for the two-stage solve
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif

  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  // Upload the row permutation once, unless it is the identity (then no reordering is needed)
  PetscCall(ISIdentity(rowperm, &rowIdentity));
  if (!rowIdentity && !fs->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(rowperm, &idx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(rowperm, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  // Likewise for the column permutation
  PetscCall(ISIdentity(colperm, &colIdentity));
  if (!colIdentity && !fs->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(colperm, &idx));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(colperm, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
6309ae82921SPaul Mullowney 
631*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  Builds (or numerically refreshes) the device-side representation of the Cholesky
  factorization Ut D U produced on the host, for use by MatSolve_SeqAIJCUSPARSE_Cholesky()
  through the generic cusparseSpSV API.

  NOTE(review): "Cheolesky" is a misspelling of "Cholesky"; the name is kept since other
  code in this file (outside this block) calls it by this name.

  First call (fs->csrRowPtr == NULL): converts the host factor U into a plain CSR matrix
  on the device, creates the cusparse matrix/vector descriptors, work vectors, and SpSV
  buffers. Every call with fresh host values: re-uploads the values and (inverted)
  diagonal and reruns the numeric SpSV analysis.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // Is this the first setup? Test csrRowPtr since it is non-null even when m=0
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; values (csrVal) are uploaded below, every call
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*(fs->csrRowPtr)) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*(fs->csrColIdx)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*(fs->csrVal)) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*(fs->diag)) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*(fs->X)) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*(fs->Y)) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse: keep the host staging buffers for values and diagonal; Mj is no longer needed
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the values (done on every call, not only the first)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; // flip the sign of the host off-diagonal values (stored negated on host; see the MatICCFactorSymbolic_SeqAIJ() reference above)
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
716d460d7bfSJunchao Zhang 
// Solves Ut D U x = b using the device-resident Cholesky factor, applying the cached
// row/column permutations (rpermIndices/cpermIndices) when they are present.
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fact = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij  = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *bdata;
  PetscScalar                          *xdata;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  const PetscInt                        n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  xdev = thrust::device_pointer_cast(xdata);
  bdev = thrust::device_pointer_cast(bdata);

  // Gather b through the row permutation (if any) into the scratch vector X; otherwise alias b directly
  if (fact->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bdev, fact->rpermIndices->begin()), thrust::make_permutation_iterator(bdev, fact->rpermIndices->end()), thrust::device_pointer_cast(fact->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, (void *)bdata));
  }

  // First triangular solve: Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_Y, fact->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fact->spMatDescr_U, fact->dnVecDescr_X, fact->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fact->spsvDescr_Ut));

  // Diagonal stage: Y <- Y .* diag. The diag array already holds the inverted D entries
  // (inverted in MatCholeskyFactorNumeric_SeqAIJ()), so this is an element-wise multiply —
  // done via thrust because cublas has no such primitive.
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fact->Y), thrust::device_pointer_cast(fact->Y + n), thrust::device_pointer_cast(fact->diag), thrust::device_pointer_cast(fact->Y), thrust::multiplies<PetscScalar>()));

  // Second triangular solve: U X = Y; route the result through the scratch X when a
  // column permutation must be applied afterwards, otherwise write straight into x
  if (fact->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, fact->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fact->dnVecDescr_X, xdata));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fact->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fact->spMatDescr_U, fact->dnVecDescr_Y, fact->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fact->spsvDescr_U));

  // Scatter the result through the column permutation (if any) into the output x
  if (fact->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X), fact->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fact->X + n), fact->cpermIndices->end()), xdev));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772d460d7bfSJunchao Zhang #else
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - builds the cuSPARSE triangular-solve data for an
  ICC factor on the legacy csrsv API path (this code is compiled only when the CUDA version
  is below 11.4; newer versions use MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky instead).

  Only the upper triangle U is stored on the host. The "lower" solve reuses the very same
  sparsity pattern: its descriptor is created with CUSPARSE_FILL_MODE_UPPER but its solve
  operation is CUSPARSE_OPERATION_TRANSPOSE, i.e. L is applied as U^T on U's storage.

  NOTE(review): A->data is read both as Mat_SeqAIJ (for a->nz) and as Mat_SeqSBAIJ (for the
  i/j/a arrays), so the ICC factor is assumed to be laid out in SeqSBAIJ-compatible form —
  confirm against the symbolic-factorization code that fills this matrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp; /* pinned-host CSR row offsets / column indices of U */
  PetscScalar                       *AAUp;        /* pinned-host values for the U (upper) solve */
  PetscScalar                       *AALo;        /* pinned-host values for the L (transposed-U) solve */
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* alias of A->data; see NOTE(review) above */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  /* empty matrix: nothing to build */
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* rebuild only when the up-to-date copy lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the factor values (freed at the end of this scope) */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* First build: create the sparsity pattern, descriptors and analysis data from scratch. */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix.
           In the incoming factor the diagonal entry is the LAST entry of each row (hence the
           v[nz] accesses below). In the rebuilt rows the diagonal is moved to the FRONT and
           stored as 1/d(i); off-diagonals are negated for U and additionally divided by d(i)
           for L — presumably the scaled form the csrsv solves expect; TODO confirm against the
           corresponding MatSolve kernels. */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description.
           DIAG_TYPE_UNIT: the stored 1/d diagonal is NOT used by the U solve itself. */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix: copy the host CSR of U to the device */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information (csrsv buffer-size query + analysis phase) */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information (the "lower" factor) */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        /* intentionally UPPER fill with a TRANSPOSE solve op: L is solved as U^T */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same pattern (AiUp/AjUp) as U, but the AALo values */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        /* pattern arrays were only allocated on this (first-build) path, so free them here */
        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Re-factorization with an existing pattern: recompute only the values and
           refresh the device-side value arrays in place. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        /* both factor structs must exist on this path (built together above) */
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      /* file-wide convention: thrust/cusparse wrappers here throw char* messages */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969d460d7bfSJunchao Zhang #endif
970087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - push the ICC factor of A to the GPU and set up
  everything the triangular solves need: the factored matrix itself (via the version-specific
  build routine), the nnz count of the combined L/U factor, and — when the row permutation is
  not the identity — device-resident copies of the permutation and its inverse.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowPerm = aij->row;
  PetscInt                      nrows   = A->rmap->n;
  PetscBool                     isIdentity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  /* legacy path needs a scratch vector between the two triangular solves */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
#endif
  /* nnz of L plus U, counting the shared diagonal once */
  factors->nnz = (aij->nz - nrows) * 2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* Cache the row permutation and its inverse on the device when reordering was used,
     so the solves can permute b and x on the GPU. */
  PetscCall(ISIdentity(rowPerm, &isIdentity));
  if (!isIdentity) {
    IS              invPerm;
    const PetscInt *invIdx, *idx;

    PetscCall(ISInvertPermutation(rowPerm, PETSC_DECIDE, &invPerm));
    PetscCall(ISGetIndices(invPerm, &invIdx));
    PetscCall(ISGetIndices(rowPerm, &idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx + nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(invIdx, invIdx + nrows);
    PetscCall(ISRestoreIndices(rowPerm, &idx));
    PetscCall(ISRestoreIndices(invPerm, &invIdx));
    PetscCall(ISDestroy(&invPerm));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SeqAIJCUSPARSE.

  The factorization itself runs on the host (after pulling A down from the GPU); this routine
  then installs the appropriate solve callbacks on B and pushes the triangular factors back to
  the device. On CUDA >= 11.4 a single SpSV-based solve handles every ordering; on older CUDA
  the solve variant depends on whether the factorization ordering is the identity.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); /* host-side numeric factorization */
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* Legacy path: pick the MatSolve variant based on the ordering of the factor. */
  {
    Mat_SeqAIJ *bfact = (Mat_SeqAIJ *)B->data;
    PetscBool   naturalOrdering;

    PetscCall(ISIdentity(bfact->row, &naturalOrdering));
    if (!naturalOrdering) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    }
  }
#endif
  /* no CUSPARSE multi-RHS solve is provided for Cholesky factors */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* push the triangular factors (and permutations, if any) to the GPU */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
10459ae82921SPaul Mullowney 
1046*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048d71ae5a4SJacob Faibussowitsch {
1049bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1055aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1056aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1057aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1058b175d8bbSPaul Mullowney 
1059bda325fcSPaul Mullowney   PetscFunctionBegin;
1060aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10619566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1062da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063aa372e3fSPaul Mullowney 
1064aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1065aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1066aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10679371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069aa372e3fSPaul Mullowney 
1070aa372e3fSPaul Mullowney   /* Create the matrix description */
10719566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10729566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10739566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10749566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10759566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076aa372e3fSPaul Mullowney 
1077aa372e3fSPaul Mullowney   /* set the operation */
1078aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079aa372e3fSPaul Mullowney 
1080aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1081aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1082afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088aa372e3fSPaul Mullowney 
1089aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10919371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
10929371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10939371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10949566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095afb2bd1cSJunchao Zhang   #endif
1096afb2bd1cSJunchao Zhang 
10979566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10989f7ba44dSJacob Faibussowitsch   {
10999f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11009f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
11019371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11039f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104afb2bd1cSJunchao Zhang   #else
11059f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106afb2bd1cSJunchao Zhang   #endif
11079f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11089f7ba44dSJacob Faibussowitsch   }
11099f7ba44dSJacob Faibussowitsch 
11109566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11119566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112aa372e3fSPaul Mullowney 
1113afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11149566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11161b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11179371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11189371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11199566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120afb2bd1cSJunchao Zhang   #endif
1121afb2bd1cSJunchao Zhang 
1122afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11239371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11249f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11259f7ba44dSJacob Faibussowitsch 
11269566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11279566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128aa372e3fSPaul Mullowney 
1129da79fbbcSStefano Zampini   /* assign the pointer */
1130aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131aa372e3fSPaul Mullowney 
1132aa372e3fSPaul Mullowney   /*********************************************/
1133aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1134aa372e3fSPaul Mullowney   /*********************************************/
1135aa372e3fSPaul Mullowney 
1136aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11379566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1138da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139aa372e3fSPaul Mullowney 
1140aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1141aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1142aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11439371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145aa372e3fSPaul Mullowney 
1146aa372e3fSPaul Mullowney   /* Create the matrix description */
11479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11489566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11499566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11509566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11519566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152aa372e3fSPaul Mullowney 
1153aa372e3fSPaul Mullowney   /* set the operation */
1154aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155aa372e3fSPaul Mullowney 
1156aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1157aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1158afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164aa372e3fSPaul Mullowney 
1165aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11679371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11689371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11699371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11709566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171afb2bd1cSJunchao Zhang   #endif
1172afb2bd1cSJunchao Zhang 
11739566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11749f7ba44dSJacob Faibussowitsch   {
11759f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11769f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11779371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11799f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180afb2bd1cSJunchao Zhang   #else
11819f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182afb2bd1cSJunchao Zhang   #endif
11839f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11849f7ba44dSJacob Faibussowitsch   }
1185d49cd2b7SBarry Smith 
11869566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11879566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188aa372e3fSPaul Mullowney 
1189afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11909566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
11921b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11939371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11949371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
11959566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196afb2bd1cSJunchao Zhang   #endif
1197afb2bd1cSJunchao Zhang 
1198afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11995f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
12009371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12019f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202d49cd2b7SBarry Smith 
12039566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12049566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205aa372e3fSPaul Mullowney 
1206da79fbbcSStefano Zampini   /* assign the pointer */
1207aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1209bda325fcSPaul Mullowney }
1210d460d7bfSJunchao Zhang #endif
1211bda325fcSPaul Mullowney 
12129371c9d4SSatish Balay struct PetscScalarToPetscInt {
12139371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1214a49f1ed0SStefano Zampini };
1215a49f1ed0SStefano Zampini 
1216d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1217d71ae5a4SJacob Faibussowitsch {
1218aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1219a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1220bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1221bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1222aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1223b175d8bbSPaul Mullowney 
1224bda325fcSPaul Mullowney   PetscFunctionBegin;
12259566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1226a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
122728b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1228a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
122908401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
12303ba16761SJacob Faibussowitsch   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
12319566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
12329566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
123348a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1234a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1235aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12369566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1237aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12389566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12399566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1240aa372e3fSPaul Mullowney 
1241b06137fdSPaul Mullowney     /* set alpha and beta */
12429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
12439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
12449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
12459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1248b06137fdSPaul Mullowney 
1249aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1250aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1251a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1252554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1253554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1254aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1255a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1256aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1257aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1258a3fdcf43SKarl Rupp 
1259ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
126081902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1261afb2bd1cSJunchao Zhang 
1262afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
12633606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
12649371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12659371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
12669371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
12673606e59fSJunchao Zhang   #else
12683606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12693606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12703606e59fSJunchao Zhang 
12713606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12723606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12733606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12743606e59fSJunchao Zhang         */
12753606e59fSJunchao Zhang       if (matrixT->num_entries) {
12769371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
12779371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
12783606e59fSJunchao Zhang 
12793606e59fSJunchao Zhang       } else {
12803606e59fSJunchao Zhang         matstructT->matDescr = NULL;
12813606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
12823606e59fSJunchao Zhang       }
12833606e59fSJunchao Zhang   #endif
1284afb2bd1cSJunchao Zhang #endif
1285aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1286afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1287afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1288afb2bd1cSJunchao Zhang #else
1289aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
129051c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
129151c6d536SStefano Zampini       /* First convert HYB to CSR */
1292aa372e3fSPaul Mullowney       temp->num_rows       = A->rmap->n;
1293aa372e3fSPaul Mullowney       temp->num_cols       = A->cmap->n;
1294aa372e3fSPaul Mullowney       temp->num_entries    = a->nz;
1295aa372e3fSPaul Mullowney       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1296aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1297aa372e3fSPaul Mullowney       temp->values         = new THRUSTARRAY(a->nz);
1298aa372e3fSPaul Mullowney 
12999371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
13009371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1301aa372e3fSPaul Mullowney 
1302aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1303aa372e3fSPaul Mullowney       tempT->num_rows       = A->rmap->n;
1304aa372e3fSPaul Mullowney       tempT->num_cols       = A->cmap->n;
1305aa372e3fSPaul Mullowney       tempT->num_entries    = a->nz;
1306aa372e3fSPaul Mullowney       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1307aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1308aa372e3fSPaul Mullowney       tempT->values         = new THRUSTARRAY(a->nz);
1309aa372e3fSPaul Mullowney 
13109371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
13119371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13129371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1313aa372e3fSPaul Mullowney 
1314aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1315aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13169566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
13179371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
13189371c9d4SSatish Balay       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
13199371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1320aa372e3fSPaul Mullowney 
1321aa372e3fSPaul Mullowney       /* assign the pointer */
1322aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13231a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1324aa372e3fSPaul Mullowney       /* delete temporaries */
1325aa372e3fSPaul Mullowney       if (tempT) {
1326aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1327aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1328aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1329aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1330087f3262SPaul Mullowney       }
1331aa372e3fSPaul Mullowney       if (temp) {
1332aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1333aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1334aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1335aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1336aa372e3fSPaul Mullowney       }
1337afb2bd1cSJunchao Zhang #endif
1338aa372e3fSPaul Mullowney     }
1339a49f1ed0SStefano Zampini   }
1340a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1341a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1342a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
134328b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
134428b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
134528b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
134628b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
134728b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
134828b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
134928b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
135028b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1351a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1352a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1353a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
13549566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1355a49f1ed0SStefano Zampini     }
1356a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1357a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1358792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1359a49f1ed0SStefano Zampini 
1360a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1361a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1362a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1363a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
13649371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
13659371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
13669371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
13679566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1368a49f1ed0SStefano Zampini #endif
1369a49f1ed0SStefano Zampini 
13701a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13711a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13721a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13731a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13741a2c6b5cSJunchao Zhang 
13751a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13761a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13771a2c6b5cSJunchao Zhang         */
13789371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1379a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13809371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
13819371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1382a49f1ed0SStefano Zampini #else
13839371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13849371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1385a49f1ed0SStefano Zampini #endif
13861a2c6b5cSJunchao Zhang       } else {
13871a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
13881a2c6b5cSJunchao Zhang       }
13891a2c6b5cSJunchao Zhang 
1390a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1391792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1392a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13939566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1394a49f1ed0SStefano Zampini #endif
1395a49f1ed0SStefano Zampini     }
13969371c9d4SSatish Balay     PetscCallThrust(
13979371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1398a49f1ed0SStefano Zampini   }
13999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14009566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1401213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1402213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1403aa372e3fSPaul Mullowney   /* assign the pointer */
1404aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
14051a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
14063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1407bda325fcSPaul Mullowney }
1408bda325fcSPaul Mullowney 
1409*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1410d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1411d460d7bfSJunchao Zhang {
1412d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1413d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1414d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1415d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1416d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1417d460d7bfSJunchao Zhang   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1418d460d7bfSJunchao Zhang   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1419d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1420d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1421d460d7bfSJunchao Zhang 
1422d460d7bfSJunchao Zhang   PetscFunctionBegin;
1423d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1424d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1425d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1426d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1427d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1428d460d7bfSJunchao Zhang 
1429d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1430d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1431d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1432d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1433d460d7bfSJunchao Zhang   } else {
1434d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1435d460d7bfSJunchao Zhang   }
1436d460d7bfSJunchao Zhang 
1437d460d7bfSJunchao Zhang   // Solve L Y = X
1438d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1439d460d7bfSJunchao Zhang   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1440d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1441d460d7bfSJunchao Zhang 
1442d460d7bfSJunchao Zhang   // Solve U X = Y
1443d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1444d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1445d460d7bfSJunchao Zhang   } else {
1446d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1447d460d7bfSJunchao Zhang   }
1448d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1449d460d7bfSJunchao Zhang 
1450d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1451d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1452d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1453d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1454d460d7bfSJunchao Zhang   }
1455d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1456d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1457d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1458d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1459d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1460d460d7bfSJunchao Zhang }
1461d460d7bfSJunchao Zhang 
1462d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1463d460d7bfSJunchao Zhang {
1464d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1465d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1466d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1467d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1468d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1469d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1470d460d7bfSJunchao Zhang   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1471d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1472d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1473d460d7bfSJunchao Zhang 
1474d460d7bfSJunchao Zhang   PetscFunctionBegin;
1475d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1476d460d7bfSJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1477d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1478d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1479d460d7bfSJunchao Zhang                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1480d460d7bfSJunchao Zhang 
1481d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1482d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1483d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1484d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1485d460d7bfSJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1486d460d7bfSJunchao Zhang   }
1487d460d7bfSJunchao Zhang 
1488d460d7bfSJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
1489d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1490d460d7bfSJunchao Zhang 
1491d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1492d460d7bfSJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1493d460d7bfSJunchao Zhang   }
1494d460d7bfSJunchao Zhang 
1495d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1496d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1497d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1498d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1499d460d7bfSJunchao Zhang 
1500d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1501d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1502d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1503d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1504d460d7bfSJunchao Zhang   } else {
1505d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1506d460d7bfSJunchao Zhang   }
1507d460d7bfSJunchao Zhang 
1508d460d7bfSJunchao Zhang   // Solve Ut Y = X
1509d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1510d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1511d460d7bfSJunchao Zhang 
1512d460d7bfSJunchao Zhang   // Solve Lt X = Y
1513d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1514d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1515d460d7bfSJunchao Zhang   } else {
1516d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1517d460d7bfSJunchao Zhang   }
1518d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1519d460d7bfSJunchao Zhang 
1520d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1521d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1522d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1523d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1524d460d7bfSJunchao Zhang   }
1525d460d7bfSJunchao Zhang 
1526d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1527d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1528d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1529d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1530d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1531d460d7bfSJunchao Zhang }
1532d460d7bfSJunchao Zhang #else
1533a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1534d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1535d71ae5a4SJacob Faibussowitsch {
1536c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1537465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1538465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1539465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1540465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1541bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1542aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1543aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1544aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1545bda325fcSPaul Mullowney 
1546bda325fcSPaul Mullowney   PetscFunctionBegin;
1547aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1548aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15499566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1550aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1551aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1552bda325fcSPaul Mullowney   }
1553bda325fcSPaul Mullowney 
1554bda325fcSPaul Mullowney   /* Get the GPU pointers */
15559566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15569566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1557c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1558c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1559bda325fcSPaul Mullowney 
15609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1561aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15629371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1563aa372e3fSPaul Mullowney 
1564aa372e3fSPaul Mullowney   /* First, solve U */
15659f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15669f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1567aa372e3fSPaul Mullowney 
1568aa372e3fSPaul Mullowney   /* Then, solve L */
15699f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15709f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1571aa372e3fSPaul Mullowney 
1572aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15739371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1574aa372e3fSPaul Mullowney 
1575aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1576a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1577bda325fcSPaul Mullowney 
1578bda325fcSPaul Mullowney   /* restore */
15799566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
15809566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
15819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
15833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1584bda325fcSPaul Mullowney }
1585bda325fcSPaul Mullowney 
1586d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587d71ae5a4SJacob Faibussowitsch {
1588465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1589465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1590bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1594bda325fcSPaul Mullowney 
1595bda325fcSPaul Mullowney   PetscFunctionBegin;
1596aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1597aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601bda325fcSPaul Mullowney   }
1602bda325fcSPaul Mullowney 
1603bda325fcSPaul Mullowney   /* Get the GPU pointers */
16049566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16059566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1606bda325fcSPaul Mullowney 
16079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1608aa372e3fSPaul Mullowney   /* First, solve U */
16099f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16109f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1611aa372e3fSPaul Mullowney 
1612aa372e3fSPaul Mullowney   /* Then, solve L */
16139f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16149f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1615bda325fcSPaul Mullowney 
1616bda325fcSPaul Mullowney   /* restore */
16179566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16189566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1622bda325fcSPaul Mullowney }
1623bda325fcSPaul Mullowney 
1624d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625d71ae5a4SJacob Faibussowitsch {
1626465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1627465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1628465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1629465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16309ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16349ae82921SPaul Mullowney 
16359ae82921SPaul Mullowney   PetscFunctionBegin;
1636e057df02SPaul Mullowney   /* Get the GPU pointers */
16379566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16389566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1640c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16419ae82921SPaul Mullowney 
16429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1643aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
16449371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1645aa372e3fSPaul Mullowney 
1646aa372e3fSPaul Mullowney   /* Next, solve L */
16479f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16489f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1649aa372e3fSPaul Mullowney 
1650aa372e3fSPaul Mullowney   /* Then, solve U */
16519f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16529f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1653d49cd2b7SBarry Smith 
16544e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16559371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16569ae82921SPaul Mullowney 
16579566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16589566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16629ae82921SPaul Mullowney }
16639ae82921SPaul Mullowney 
1664d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665d71ae5a4SJacob Faibussowitsch {
1666465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1667465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16689ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16729ae82921SPaul Mullowney 
16739ae82921SPaul Mullowney   PetscFunctionBegin;
1674e057df02SPaul Mullowney   /* Get the GPU pointers */
16759566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16769566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16779ae82921SPaul Mullowney 
16789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1679aa372e3fSPaul Mullowney   /* First, solve L */
16809f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16819f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1682d49cd2b7SBarry Smith 
1683aa372e3fSPaul Mullowney   /* Next, solve U */
16849f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16859f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
16869ae82921SPaul Mullowney 
16879566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16889566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16909566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16929ae82921SPaul Mullowney }
1693d460d7bfSJunchao Zhang #endif
16949ae82921SPaul Mullowney 
1695*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
16968eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697d71ae5a4SJacob Faibussowitsch {
1698da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1700da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1702da112707SJunchao Zhang   PetscInt                      m, nz;
1703da112707SJunchao Zhang   PetscBool                     flg;
1704da112707SJunchao Zhang 
1705da112707SJunchao Zhang   PetscFunctionBegin;
1706da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1707da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709da112707SJunchao Zhang   }
1710da112707SJunchao Zhang 
1711da112707SJunchao Zhang   /* Copy A's value to fact */
1712da112707SJunchao Zhang   m  = fact->rmap->n;
1713da112707SJunchao Zhang   nz = aij->nz;
1714da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1716da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1717da112707SJunchao Zhang 
1718da112707SJunchao Zhang   /* Factorize fact inplace */
17199371c9d4SSatish Balay   if (m)
17209371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1721d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1722da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1723da112707SJunchao Zhang     int              numerical_zero;
1724da112707SJunchao Zhang     cusparseStatus_t status;
1725da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1726da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1727da112707SJunchao Zhang   }
1728da112707SJunchao Zhang 
172912ba2bc6SJunchao Zhang   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
173012ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
173112ba2bc6SJunchao Zhang   */
17329371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1733da112707SJunchao Zhang 
17349371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1735da112707SJunchao Zhang 
173612ba2bc6SJunchao Zhang   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
173712ba2bc6SJunchao Zhang   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
173812ba2bc6SJunchao Zhang 
1739da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1740d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1741d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1742da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1743da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1744da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17453ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1746da112707SJunchao Zhang }
1747da112707SJunchao Zhang 
17488eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1749d71ae5a4SJacob Faibussowitsch {
1750da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1751da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1752da112707SJunchao Zhang   PetscInt                      m, nz;
1753da112707SJunchao Zhang 
1754da112707SJunchao Zhang   PetscFunctionBegin;
1755da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1756da112707SJunchao Zhang     PetscInt  i;
1757da112707SJunchao Zhang     PetscBool flg, missing;
1758da112707SJunchao Zhang 
1759da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1760da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1761da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1762da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1763da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1764da112707SJunchao Zhang   }
1765da112707SJunchao Zhang 
1766da112707SJunchao Zhang   /* Free the old stale stuff */
1767da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1768da112707SJunchao Zhang 
1769da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1770da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1771da112707SJunchao Zhang    */
1772da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1773da112707SJunchao Zhang 
1774da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1775da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1776da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1777da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1778da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1779da112707SJunchao Zhang 
1780da112707SJunchao Zhang   aij->row = NULL;
1781da112707SJunchao Zhang   aij->col = NULL;
1782da112707SJunchao Zhang 
1783da112707SJunchao Zhang   /* ====================================================================== */
1784da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1785da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1786da112707SJunchao Zhang   /* ====================================================================== */
1787da112707SJunchao Zhang   const int *Ai, *Aj;
1788da112707SJunchao Zhang 
1789da112707SJunchao Zhang   m  = fact->rmap->n;
1790da112707SJunchao Zhang   nz = aij->nz;
1791da112707SJunchao Zhang 
1792d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
1793d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
1794d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*(fs->csrVal)) * nz));
1795d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1796d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1797d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1798da112707SJunchao Zhang 
1799da112707SJunchao Zhang   /* ====================================================================== */
1800da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1801da112707SJunchao Zhang   /* ====================================================================== */
1802da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1803da112707SJunchao Zhang   cusparseDiagType_t diagType;
1804da112707SJunchao Zhang 
1805da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1806da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1807da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1808da112707SJunchao Zhang 
1809da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1810da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1811da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1812da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1813da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1814da112707SJunchao Zhang   */
1815da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1816da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1817d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18189371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18199371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1820da112707SJunchao Zhang 
1821da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1822da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1823d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18249371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18259371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1826da112707SJunchao Zhang 
1827da112707SJunchao Zhang   /* ========================================================================= */
1828da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1829da112707SJunchao Zhang   /* ========================================================================= */
1830da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18319371c9d4SSatish Balay   if (m)
18329371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1833d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1834da112707SJunchao Zhang 
1835da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1836da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1837da112707SJunchao Zhang 
1838da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1839da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1840da112707SJunchao Zhang 
1841da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18429371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1843da112707SJunchao Zhang 
1844da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18459371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1846da112707SJunchao Zhang 
1847da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
184812ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
184912ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
185012ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1851da112707SJunchao Zhang    */
185212ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
185312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
185412ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1855da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
185612ba2bc6SJunchao Zhang   } else {
185712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
185812ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1859da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
186012ba2bc6SJunchao Zhang   }
1861da112707SJunchao Zhang 
1862da112707SJunchao Zhang   /* ========================================================================== */
1863da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1864da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1865da112707SJunchao Zhang   /* ========================================================================== */
1866da112707SJunchao Zhang   int              structural_zero;
1867da112707SJunchao Zhang   cusparseStatus_t status;
1868da112707SJunchao Zhang 
1869da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18709371c9d4SSatish Balay   if (m)
18719371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1872d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1873da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1874da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1875da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1876da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1877da112707SJunchao Zhang   }
1878da112707SJunchao Zhang 
1879da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18800dd8c0acSJunchao Zhang   {
1881da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
18820dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1883da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1884da112707SJunchao Zhang 
1885da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1886da112707SJunchao Zhang     Ai    = Aseq->i;
1887da112707SJunchao Zhang     Adiag = Aseq->diag;
1888da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1889da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1890da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1891da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1892da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1893da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1894da112707SJunchao Zhang         */
1895da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1896da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1897da112707SJunchao Zhang       }
1898da112707SJunchao Zhang     }
1899da112707SJunchao Zhang     fs->numericFactFlops = flops;
19000dd8c0acSJunchao Zhang   }
1901da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1903da112707SJunchao Zhang }
1904da112707SJunchao Zhang 
/* MatSolve_SeqAIJCUSPARSE_ICC0 - solve A x = b using the ICC(0) factor A ~ L*Lt computed by
   MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0().

   Performs two cuSPARSE triangular solves on the GPU: L y = b, then Lt x = y. Only the lower
   factor L is stored, so the second solve reuses spMatDescr_L with CUSPARSE_OPERATION_TRANSPOSE
   (and the separate analysis data in spsvDescr_Lt). The dense-vector descriptors are rebound to
   the caller's arrays with cusparseDnVecSetValues() rather than recreated each solve.
 */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve L*y = b, with y stored in the internal work array fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y; rebind descriptor X to the output array so the result lands directly in x */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  /* Two triangular solves: roughly one multiply+add per stored nonzero, minus n for the diagonal */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1935da112707SJunchao Zhang 
/* MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - numeric phase of zero-fill incomplete Cholesky (ICC(0)) on the GPU

   Copies A's current values into the factor matrix prepared by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(),
   factorizes in place with cusparseXcsric02(), then runs the SpSV analysis needed by the two triangular
   solves (L and Lt) in MatSolve_SeqAIJCUSPARSE_ICC0(). The descriptors, index arrays and work buffers were
   all created in the symbolic phase; only values change here.
 */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's values to fact (device-to-device; the sparsity pattern is identical by construction) */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact in place */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* Blocking call; reports a zero pivot encountered during the numeric factorization above */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analysis for the forward solve with L */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  /* Factors now live on the GPU only; install the GPU solve routines (mat solves not supported here) */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1988da112707SJunchao Zhang 
/* MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - symbolic phase of zero-fill incomplete Cholesky on the GPU

   Sets up everything cusparseXcsric02() and the two SpSV triangular solves need: copies A's CSR
   row/column indices into the factor matrix, creates matrix/vector descriptors, queries and
   allocates the work buffers (sharing the factorization buffer with the larger solve buffer to
   save memory), and runs the structural analysis. The numeric phase is then routed to
   MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(). Only reached for natural ordering with zero levels
   of fill (enforced by MatICCFactorSymbolic_SeqAIJCUSPARSE()).
 */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ICC(0) introduces no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj; /* device pointers to A's 32-bit CSR index arrays */

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*(fs->csrRowPtr32)) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*(fs->csrColIdx32)) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares storage with M: same index arrays and values, viewed as the lower triangle */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share storage with the larger of the
     two solver buffers. See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization, so PetscLogGpuFlops() can be called there */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft; /* Ai here is A's host row-offset array (shadows the device Ai above) */
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We eliminate the nonzeros left of the diagonal one by one. Assume each elimination updates
           the nonzeros to the right of (and including) the eliminated one, at a multiplication and an
           addition per updated entry.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2130da112707SJunchao Zhang #endif
2131da112707SJunchao Zhang 
/* MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization

   The factorization itself runs on the CPU via MatLUFactorNumeric_SeqAIJ() (after syncing A's
   values from the GPU); afterwards the solve function pointers are redirected to GPU
   implementations unless CPU solves were requested, and the triangular factors are copied to
   the device.
 */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  /* NOTE(review): B is a factor matrix, and MatGetFactor_seqaijcusparse_cusparse() accesses the
     same B->spptr as a Mat_SeqAIJCUSPARSETriFactors (factorizeOnDevice). The cast here to
     Mat_SeqAIJCUSPARSE to read use_cpu_solve looks inconsistent — confirm which struct
     B->spptr actually holds for factor matrices. */
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* the CPU factorization reads A's host values */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* fresh factors live on the host until copied below */

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used. */
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL; /* multi-RHS solves not provided by this path */
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2169d460d7bfSJunchao Zhang 
/* Symbolic LU factorization: discard any stale GPU triangular-factor data, run the host
   symbolic phase, and route the numeric phase to the CUSPARSE-aware implementation. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  /* Previously computed device factors are invalid once a new symbolic pattern is built */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2180d460d7bfSJunchao Zhang 
/* Symbolic ILU factorization dispatcher: when CUDA >= 11.4, device-side factorization is
   enabled, no fill levels are requested, and both orderings are the identity, use the pure-GPU
   ILU(0) path; otherwise fall back to the host symbolic phase with a CUSPARSE-aware numeric
   routine. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool rowIdentity = PETSC_FALSE, colIdentity = PETSC_FALSE;
  if (factors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &rowIdentity));
    PetscCall(ISIdentity(iscol, &colIdentity));
  }
  if (!info->levels && rowIdentity && colIdentity) {
    /* Device path: ILU(0) with natural ordering handled entirely by cuSPARSE */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2203da112707SJunchao Zhang 
/* Symbolic ICC factorization dispatcher: when CUDA >= 11.4, device-side factorization is
   enabled, no fill levels are requested, and the permutation is the identity, use the pure-GPU
   ICC(0) path; otherwise fall back to the host symbolic phase with a CUSPARSE-aware numeric
   routine. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool permIdentity = PETSC_FALSE;
  if (factors->factorizeOnDevice) PetscCall(ISIdentity(perm, &permIdentity));
  if (!info->levels && permIdentity) {
    /* Device path: ICC(0) with natural ordering handled entirely by cuSPARSE */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
2223da112707SJunchao Zhang 
/* Symbolic Cholesky factorization: drop stale device factors, delegate the symbolic work to
   the host AIJ implementation, and install the CUSPARSE-aware numeric routine. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Previously computed device factors are invalid once a new symbolic pattern is built */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2234da112707SJunchao Zhang 
/* Report the MatSolverType implemented by this factorization back end: always MATSOLVERCUSPARSE */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2241841d4cb1SJunchao Zhang 
2242841d4cb1SJunchao Zhang /*MC
2243841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
224411a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2249841d4cb1SJunchao Zhang 
2250841d4cb1SJunchao Zhang   Level: beginner
2251841d4cb1SJunchao Zhang 
22522ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22532ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2254841d4cb1SJunchao Zhang M*/
2255841d4cb1SJunchao Zhang 
/* Creates an empty MATSEQAIJCUSPARSE matrix B that will hold the (I)LU/(I)CC
   factorization of A; only the symbolic-factorization function pointers and the
   preferred orderings are installed here.

   The place where the factorization is computed (host or device) is selected with
   the runtime option -mat_factor_bind_factorization <host|device> (default: device).

   Supported ftype values: MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT,
   MAT_FACTOR_CHOLESKY, MAT_FACTOR_ICC; anything else raises PETSC_ERR_SUP. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Parse -mat_factor_bind_factorization under the factor's (or A's) options prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  /* spptr was allocated by MatSetType(MATSEQAIJCUSPARSE) above */
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* A is bound to the CPU: fall back to the host SeqAIJ symbolic factorizations */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* host fallback, as for the LU family above */
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2308841d4cb1SJunchao Zhang 
/* Synchronizes the host copy of the nonzero values (a->a) with the device copy when
   the device is ahead (PETSC_OFFLOAD_GPU); afterwards both copies agree and the mask
   becomes PETSC_OFFLOAD_BOTH. Unfactored matrices copy from the device CSR values;
   factored matrices (CUDA >= 11.4 only) copy from the factor value array fs->csrVal. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* spptr doubles as the triangular-factors struct when A is factored */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23387e8381f9SStefano Zampini 
/* MatSeqAIJGetArray() implementation: hands out the host pointer to the nonzero
   values, first syncing them down from the GPU if the device copy is newer. The
   matching restore routine marks the host copy as the authoritative one. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* make sure the host values are current before exposing them */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
234667a45760SJunchao Zhang 
/* MatSeqAIJRestoreArray() implementation: the caller may have written through the
   pointer from MatSeqAIJGetArray(), so the host copy becomes the valid one. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* drop the borrowed pointer */
  /* invalidate the device copy; it will be refreshed on the next CopyToGPU */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
235467a45760SJunchao Zhang 
/* MatSeqAIJGetArrayRead() implementation: read-only host access to the nonzero
   values; the device copy is synced down first and stays valid afterwards. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* bring the host values up to date with the device copy */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236267a45760SJunchao Zhang 
/* MatSeqAIJRestoreArrayRead() implementation: read-only access does not modify the
   values, so the offload mask is left untouched; just drop the borrowed pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236967a45760SJunchao Zhang 
/* MatSeqAIJGetArrayWrite() implementation: write-only access, so no device-to-host
   sync is needed; the restore routine will mark the host copy as modified. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237667a45760SJunchao Zhang 
/* MatSeqAIJRestoreArrayWrite() implementation: the caller wrote new values on the
   host, so flag the host copy as the only valid one. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* drop the borrowed pointer */
  /* device copy is now stale; refreshed by the next CopyToGPU */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
23847e8381f9SStefano Zampini 
/* MatSeqAIJGetCSRAndMemType() implementation: exposes the device-resident CSR arrays
   (row offsets i, column indices j, values a) of an unfactored matrix, syncing
   host->device first. Any of i/j/a/mtype may be NULL if not wanted.

   The device index arrays are 32-bit (THRUSTINTARRAY32), so they cannot be returned
   as PetscInt* in 64-bit-index builds; that configuration raises PETSC_ERR_SUP.

   Fix: the error message read "does not supported"; corrected to "does not support". */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24157ee59b9bSJunchao Zhang 
/* Uploads (or rebuilds) the matrix on the GPU when the host copy is newer.
   Two paths:
   - same nonzero pattern and CSR storage: upload values only into the existing
     device arrays, and invalidate just the transpose values;
   - otherwise: destroy and rebuild the entire device structure (CSR, or ELL/HYB
     before CUDA 11) from the host CSR, including the compressed-row index arrays.
   On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host had no
   value array yet (only the pattern was uploaded), in which case it is left as is. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz); /* host->device copy via thrust */
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* pattern unchanged: only the transpose's values need recomputing */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* nonzero pattern (or format) changed: tear everything down and rebuild */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* only rows with nonzeros are stored; ridx maps them back to global rows */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload the pattern only and keep the mask unchanged */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants for the POINTER_MODE_DEVICE cuSPARSE calls */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary device CSR, convert it to HYB, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25659ae82921SPaul Mullowney 
/* Thrust functor over a zipped (source, destination) tuple: dest += source */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2573aa372e3fSPaul Mullowney 
/* Thrust functor over a zipped (source, destination) tuple: dest = source */
struct VecCUDAEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT tup)
  {
    thrust::get<1>(tup) = thrust::get<0>(tup);
  }
};
25817e8381f9SStefano Zampini 
/* Thrust functor over a zipped tuple, assigning in the opposite direction of
   VecCUDAEquals: element 0 = element 1 */
struct VecCUDAEqualsReverse {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT tup)
  {
    thrust::get<0>(tup) = thrust::get<1>(tup);
  }
};
2589e6e9a74fSStefano Zampini 
/* Scratch data attached to a Mat_Product for cuSPARSE matrix-matrix products;
   released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably records whether C started as host dense and must be converted back -- confirm at the symbolic phase */
  PetscScalar   *Bt;       /* device buffer freed with cudaFree(); presumably an explicit transpose of B -- confirm where it is filled */
  Mat            X;        /* work matrix holding the intermediate product for PtAP/RARt (see MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count accumulated for logging */
  CsrMatrix     *Bcsr;     /* device CSR copy of B, owned here (deleted in the destructor) */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B (SpMM) */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C (or the work matrix X) */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers used by the CUDA >= 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2614ccdfe979SStefano Zampini 
/* Destructor for the MatMatCusparse scratch data stored in C->product->data.
   Frees every device buffer and cuSPARSE descriptor; descriptor destruction is
   guarded against NULL, and cudaFree(NULL) is a documented no-op. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr; /* delete NULL is safe */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X)); /* MatDestroy() handles a NULL Mat */
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2638ccdfe979SStefano Zampini 
26394742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2640ccdfe979SStefano Zampini 
2641d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2642d71ae5a4SJacob Faibussowitsch {
2643ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2644ccdfe979SStefano Zampini   Mat                           A, B;
2645afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2646ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2647ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2648ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2649ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2650ccdfe979SStefano Zampini   const PetscScalar            *barray;
2651ccdfe979SStefano Zampini   PetscScalar                  *carray;
2652ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2653ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2654ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2655ccdfe979SStefano Zampini 
2656ccdfe979SStefano Zampini   PetscFunctionBegin;
2657ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
265828b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2659ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2660ccdfe979SStefano Zampini   A      = product->A;
2661ccdfe979SStefano Zampini   B      = product->B;
26629566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
266328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2664ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2665ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
266628b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2668ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2669ccdfe979SStefano Zampini   switch (product->type) {
2670ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2671ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2672ccdfe979SStefano Zampini     mat = cusp->mat;
2673ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2674ccdfe979SStefano Zampini     m   = A->rmap->n;
2675ccdfe979SStefano Zampini     n   = B->cmap->n;
2676ccdfe979SStefano Zampini     break;
2677ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
26781a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2679e6e9a74fSStefano Zampini       mat = cusp->mat;
2680e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2681e6e9a74fSStefano Zampini     } else {
26829566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2683ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2684ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2685e6e9a74fSStefano Zampini     }
2686ccdfe979SStefano Zampini     m = A->cmap->n;
2687ccdfe979SStefano Zampini     n = B->cmap->n;
2688ccdfe979SStefano Zampini     break;
2689ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2690ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2691ccdfe979SStefano Zampini     mat = cusp->mat;
2692ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2693ccdfe979SStefano Zampini     m   = A->rmap->n;
2694ccdfe979SStefano Zampini     n   = B->rmap->n;
2695ccdfe979SStefano Zampini     break;
2696d71ae5a4SJacob Faibussowitsch   default:
2697d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2698ccdfe979SStefano Zampini   }
269928b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2700ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2701ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27029566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27039566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2704cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2705afb2bd1cSJunchao Zhang 
27069566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2707c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2708cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27099566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2710c8378d12SStefano Zampini   } else {
2711cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27129566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2713c8378d12SStefano Zampini   }
2714c8378d12SStefano Zampini 
27159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2716afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2717afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2718a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2719afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2720fcdce8c4SStefano Zampini     size_t mmBufferSize;
27219371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27229371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27239371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27249371c9d4SSatish Balay     }
2725afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27269566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2727afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2728afb2bd1cSJunchao Zhang     }
2729c8378d12SStefano Zampini 
27309371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27319371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27329371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27339371c9d4SSatish Balay     }
2734afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27359566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2736afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2737afb2bd1cSJunchao Zhang     }
2738afb2bd1cSJunchao Zhang 
2739afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
27409371c9d4SSatish Balay       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27419371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27429371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2743afb2bd1cSJunchao Zhang     }
27449371c9d4SSatish Balay     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
27459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2746fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27479566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27489566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2749fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2750fcdce8c4SStefano Zampini     }
2751afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2752afb2bd1cSJunchao Zhang   } else {
2753afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
27549566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
27559566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
27569566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2757afb2bd1cSJunchao Zhang   }
2758afb2bd1cSJunchao Zhang 
2759afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
27609371c9d4SSatish Balay   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
27619371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2762afb2bd1cSJunchao Zhang #else
2763afb2bd1cSJunchao Zhang   PetscInt k;
2764afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2765ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2766ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2767ccdfe979SStefano Zampini     cublasStatus_t cerr;
2768ccdfe979SStefano Zampini 
27699566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
27709371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
27719371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2772ccdfe979SStefano Zampini     blda = B->cmap->n;
2773afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2774afb2bd1cSJunchao Zhang   } else {
2775afb2bd1cSJunchao Zhang     k = B->rmap->n;
2776ccdfe979SStefano Zampini   }
2777ccdfe979SStefano Zampini 
2778afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
27799371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
27809371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2781afb2bd1cSJunchao Zhang #endif
27829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
27839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2784cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2785ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2786cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
27874742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2788ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2789cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
27904742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2791ccdfe979SStefano Zampini   } else {
2792cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2793ccdfe979SStefano Zampini   }
279448a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
279548a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
27963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2797ccdfe979SStefano Zampini }
2798ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for a product C = op(A) op(B)
  with A sparse (MATSEQAIJCUSPARSE, CSR storage only) and B dense.

  Input/Output Parameter:
. C - the product matrix; C->product carries A, B and the product type

  Supported product types: AB, AtB, ABt, PtAP (B plays the role of P) and RARt (B plays R).
  This routine only sets the sizes and type of C, allocates the MatMatCusparse product data,
  and (for RARt/PtAP) creates the intermediate dense matrix mmdata->X; all computation is
  deferred to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA, which is installed here.
*/
2799d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2800d71ae5a4SJacob Faibussowitsch {
2801ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2802ccdfe979SStefano Zampini   Mat                 A, B;
2803ccdfe979SStefano Zampini   PetscInt            m, n;
2804ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2805ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2806ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2807ccdfe979SStefano Zampini 
2808ccdfe979SStefano Zampini   PetscFunctionBegin;
2809ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
281028b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2811ccdfe979SStefano Zampini   A = product->A;
2812ccdfe979SStefano Zampini   B = product->B;
28139566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
281428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2815ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
281608401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* determine the (m,n) sizes of C from the product type; for PtAP/RARt the result size depends on B only */
2817ccdfe979SStefano Zampini   switch (product->type) {
2818ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2819ccdfe979SStefano Zampini     m = A->rmap->n;
2820ccdfe979SStefano Zampini     n = B->cmap->n;
2821ccdfe979SStefano Zampini     break;
2822ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2823ccdfe979SStefano Zampini     m = A->cmap->n;
2824ccdfe979SStefano Zampini     n = B->cmap->n;
2825ccdfe979SStefano Zampini     break;
2826ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2827ccdfe979SStefano Zampini     m = A->rmap->n;
2828ccdfe979SStefano Zampini     n = B->rmap->n;
2829ccdfe979SStefano Zampini     break;
2830ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2831ccdfe979SStefano Zampini     m = B->cmap->n;
2832ccdfe979SStefano Zampini     n = B->cmap->n;
2833ccdfe979SStefano Zampini     break;
2834ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2835ccdfe979SStefano Zampini     m = B->rmap->n;
2836ccdfe979SStefano Zampini     n = B->rmap->n;
2837ccdfe979SStefano Zampini     break;
2838d71ae5a4SJacob Faibussowitsch   default:
2839d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2840ccdfe979SStefano Zampini   }
28419566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2842ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
28439566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
28449566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2845ccdfe979SStefano Zampini 
2846ccdfe979SStefano Zampini   /* product data */
28479566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
  /* remember whether the caller's C was host-dense so the numeric phase can convert it back after computing on the GPU */
2848ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2849afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2850afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
285148a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2852afb2bd1cSJunchao Zhang #endif
2853ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2854ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
28559566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
28569566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2857ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
28589566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2859ccdfe979SStefano Zampini     } else {
28609566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2861ccdfe979SStefano Zampini     }
2862ccdfe979SStefano Zampini   }
  /* attach the product data to C; MatDestroy_MatMatCusparse releases it when the product is destroyed */
2863ccdfe979SStefano Zampini   C->product->data    = mmdata;
2864ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2865ccdfe979SStefano Zampini 
2866ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
28673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2868ccdfe979SStefano Zampini }
2869ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a sparse-sparse product
  C = op(A) op(B) with A, B and C all MATSEQAIJCUSPARSE (CSR storage only).

  Input/Output Parameter:
. C - the product matrix; its sparsity pattern and the cuSPARSE SpGEMM descriptor were
      set up by the matching symbolic phase (C->product->data must be populated)

  Supported product types: AB, AtB, ABt.  Because cuSPARSE spgemm does not support
  transposed operands (see opA/opB below), AtB/ABt use the explicit transposes formed
  during the symbolic phase.  Ends by performing the "short" assembly bookkeeping of
  MatAssemblyEnd_SeqAIJ (the finalize: section).
*/
2870d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2871d71ae5a4SJacob Faibussowitsch {
2872ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2873fcdce8c4SStefano Zampini   Mat                           A, B;
2874fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2875fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2876fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2877fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2878fcdce8c4SStefano Zampini   PetscBool                     flg;
2879fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2880fcdce8c4SStefano Zampini   MatProductType                ptype;
2881fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2882fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2883fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2884fcdce8c4SStefano Zampini #endif
2885b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2886ccdfe979SStefano Zampini 
2887ccdfe979SStefano Zampini   PetscFunctionBegin;
2888ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
288928b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
28909566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
289128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2892fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2893fcdce8c4SStefano Zampini   A      = product->A;
2894fcdce8c4SStefano Zampini   B      = product->B;
2895fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2896fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2897fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
289808401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2899fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
290028b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2901fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
290228b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2903fcdce8c4SStefano Zampini     goto finalize;
2904fcdce8c4SStefano Zampini   }
  /* empty product: nothing to compute, only finish the assembly bookkeeping */
2905fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29069566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
290728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29089566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
290928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
291028b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
291128b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2912fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2913fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2914fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
291508401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
291608401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
291708401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* make sure the latest host values of A and B are on the GPU before computing */
29189566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2920fcdce8c4SStefano Zampini 
  /* if A (resp. B) is symmetric, the symbolic phase replaced AtB (resp. ABt) by the cheaper AB;
     mirror that substitution here and verify the symbolic phase really did it */
2921fcdce8c4SStefano Zampini   ptype = product->type;
2922b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2923fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
292428b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2925fa046f9fSJunchao Zhang   }
2926b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2927fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
292828b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2929fa046f9fSJunchao Zhang   }
  /* select the mult structs for op(A) and op(B): transposed operands use the explicit
     transposes (matTranspose) since spgemm is called with non-transpose ops only */
2930fcdce8c4SStefano Zampini   switch (ptype) {
2931fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2932fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2933fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2934fcdce8c4SStefano Zampini     break;
2935fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2936fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2937fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2938fcdce8c4SStefano Zampini     break;
2939fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2940fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2941fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2942fcdce8c4SStefano Zampini     break;
2943d71ae5a4SJacob Faibussowitsch   default:
2944d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2945fcdce8c4SStefano Zampini   }
2946fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
294728b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
294828b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
294928b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2950fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2951fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2952fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
295328b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
295428b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
295528b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
29569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2957fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2958fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
29599566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  /* CUDA >= 11.4: SpGEMMreuse recomputes only the values using the pattern stored in
     mmdata->spgemmDesc; older CUDA 11.x needs compute followed by an explicit copy into C */
2960b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
29619371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29629371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2963b4285af6SJunchao Zhang   #else
29649371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29659371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29669371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29679371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2968b4285af6SJunchao Zhang   #endif
2969fcdce8c4SStefano Zampini #else
  /* pre-CUDA-11 path: legacy csrgemm writes directly into C's CSR arrays */
29709371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29719371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
29729371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2973fcdce8c4SStefano Zampini #endif
29749566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
29759566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
29769566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
  /* the result now lives on the GPU only */
2977fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2978fcdce8c4SStefano Zampini finalize:
2979fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
29809566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
29819566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
29829566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2983fcdce8c4SStefano Zampini   c->reallocs = 0;
2984fcdce8c4SStefano Zampini   C->info.mallocs += 0;
2985fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2986fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2987fcdce8c4SStefano Zampini   C->num_ass++;
29883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2989ccdfe979SStefano Zampini }
2990fcdce8c4SStefano Zampini 
2991d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2992d71ae5a4SJacob Faibussowitsch {
2993fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2994fcdce8c4SStefano Zampini   Mat                           A, B;
2995fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2996fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2997fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2998fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2999fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3000fcdce8c4SStefano Zampini   PetscBool                     flg;
3001fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3002fcdce8c4SStefano Zampini   MatProductType                ptype;
3003fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3004fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3005fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3006fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3007fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3008fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3009fcdce8c4SStefano Zampini #else
3010fcdce8c4SStefano Zampini   int cnz;
3011fcdce8c4SStefano Zampini #endif
3012b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3013fcdce8c4SStefano Zampini 
3014fcdce8c4SStefano Zampini   PetscFunctionBegin;
3015fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
301628b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3017fcdce8c4SStefano Zampini   A = product->A;
3018fcdce8c4SStefano Zampini   B = product->B;
30199566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
302028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30219566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
302228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3023fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3024fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3025fcdce8c4SStefano Zampini   /* product data */
30269566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3027fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3028fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3029fcdce8c4SStefano Zampini 
30309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3032d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3033d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
303408401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
303508401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3036d60bce21SJunchao Zhang 
3037fcdce8c4SStefano Zampini   ptype = product->type;
3038b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3039fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3040fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3041fa046f9fSJunchao Zhang   }
3042b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3043fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3044fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3045fa046f9fSJunchao Zhang   }
3046fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3047fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3048fcdce8c4SStefano Zampini   switch (ptype) {
3049fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3050fcdce8c4SStefano Zampini     m    = A->rmap->n;
3051fcdce8c4SStefano Zampini     n    = B->cmap->n;
3052fcdce8c4SStefano Zampini     k    = A->cmap->n;
3053fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3054fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3055fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3056fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3057fcdce8c4SStefano Zampini     break;
3058fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3059fcdce8c4SStefano Zampini     m = A->cmap->n;
3060fcdce8c4SStefano Zampini     n = B->cmap->n;
3061fcdce8c4SStefano Zampini     k = A->rmap->n;
30629566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3063fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3064fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3065fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3066fcdce8c4SStefano Zampini     break;
3067fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3068fcdce8c4SStefano Zampini     m = A->rmap->n;
3069fcdce8c4SStefano Zampini     n = B->rmap->n;
3070fcdce8c4SStefano Zampini     k = A->cmap->n;
30719566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3072fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3073fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3074fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3075fcdce8c4SStefano Zampini     break;
3076d71ae5a4SJacob Faibussowitsch   default:
3077d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3078fcdce8c4SStefano Zampini   }
3079fcdce8c4SStefano Zampini 
3080fcdce8c4SStefano Zampini   /* create cusparse matrix */
30819566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
30829566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3083fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3084fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3085fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3086fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3087fcdce8c4SStefano Zampini 
3088fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3089fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3090fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
30919566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
30929566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3093fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3094fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3095fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3096fcdce8c4SStefano Zampini   } else {
3097fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3098fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3099fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3100fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3101fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3102fcdce8c4SStefano Zampini   }
3103fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3104fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3105fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3106fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3107fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3108fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31099566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31109566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31119566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
31129566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
31139566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
31149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
31159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31179566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3118fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3119d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3120fcdce8c4SStefano Zampini     c->nz                = 0;
3121fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3122fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3123fcdce8c4SStefano Zampini     goto finalizesym;
3124fcdce8c4SStefano Zampini   }
3125fcdce8c4SStefano Zampini 
312628b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
312728b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3128fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3129fcdce8c4SStefano Zampini   if (!biscompressed) {
3130fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3131fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3132fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3133fcdce8c4SStefano Zampini #endif
3134fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3135fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3136fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3137fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3138fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3139fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3140fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3141fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3142fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3143fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3144fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31459566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3146fcdce8c4SStefano Zampini     }
3147fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3148fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3149fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3150fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
31519371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31529371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3153fcdce8c4SStefano Zampini     }
3154fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3155fcdce8c4SStefano Zampini #endif
3156fcdce8c4SStefano Zampini   }
315728b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
315828b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3159fcdce8c4SStefano Zampini   /* precompute flops count */
3160fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3161fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3162fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3163fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3164fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3165fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3166fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3167fcdce8c4SStefano Zampini       }
3168fcdce8c4SStefano Zampini     }
3169fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3170fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3171fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3172fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3173fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3174fcdce8c4SStefano Zampini     }
3175fcdce8c4SStefano Zampini   } else { /* TODO */
3176fcdce8c4SStefano Zampini     flops = 0.;
3177fcdce8c4SStefano Zampini   }
3178fcdce8c4SStefano Zampini 
3179fcdce8c4SStefano Zampini   mmdata->flops = flops;
31809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3181b4285af6SJunchao Zhang 
3182fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
31839566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
31849371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31859371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
31869566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3187b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3188b4285af6SJunchao Zhang   {
3189b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3190b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3191b4285af6SJunchao Zhang   */
3192b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3193b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3194b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3195b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3196b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3197b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3198b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3199b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3200b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3201b4285af6SJunchao Zhang 
3202b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32039371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32049371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32059566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3206b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32079371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32089371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3209b4285af6SJunchao Zhang 
32109371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32119371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32129566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32139566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32159371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32169371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3219b4285af6SJunchao Zhang 
3220b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32219566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3222b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3223b4285af6SJunchao Zhang     /* allocate matrix C */
32249371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32259371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32269371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32279371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3228b4285af6SJunchao Zhang     /* update matC with the new pointers */
32299371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32309371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3231b4285af6SJunchao Zhang 
32329371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32339371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32349566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32359371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32369371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32379566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32389371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32399371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32409566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3241b4285af6SJunchao Zhang   }
3242ae37ee31SJunchao Zhang   #else
3243b4285af6SJunchao Zhang   size_t bufSize2;
3244fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
32459371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
32469371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32479566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3248fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
32499371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
32509371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3251fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
32529371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
32539371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3254fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3255fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3256fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3257fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3258fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
32599566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3260fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
32619371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
32629371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3263fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
32649566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3265fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
32669371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
32679371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3268fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32699566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3270fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
32719566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32729371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32739371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32749371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32759371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3276ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3277fcdce8c4SStefano Zampini #else
32789566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
32799371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
32809371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
32819371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3282fcdce8c4SStefano Zampini   c->nz                = cnz;
3283fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32849566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3285fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
32869566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3287fcdce8c4SStefano Zampini 
32889566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3289fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3290fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3291fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
32929371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
32939371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
32949371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3295fcdce8c4SStefano Zampini #endif
32969566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
32979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3298fcdce8c4SStefano Zampini finalizesym:
3299fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3300fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3301fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
33029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
33039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3304fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3305fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3306fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3307fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3308fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3309fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3310fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33119566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33129566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3313fcdce8c4SStefano Zampini   } else {
3314fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3315fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3318fcdce8c4SStefano Zampini   }
3319fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3320fcdce8c4SStefano Zampini     PetscInt r = 0;
3321fcdce8c4SStefano Zampini     c->i[0]    = 0;
3322fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3323fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3324fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3325fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3326fcdce8c4SStefano Zampini     }
3327fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3328fcdce8c4SStefano Zampini   }
33299566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33309566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33319566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3332fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3333fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3334fcdce8c4SStefano Zampini   c->rmax          = 0;
3335fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3336fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3337fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3338fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3339fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3340fcdce8c4SStefano Zampini   }
33419566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
33429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3343fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3344fcdce8c4SStefano Zampini 
3345fcdce8c4SStefano Zampini   C->nonzerostate++;
33469566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
33479566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3348fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3349fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3350fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3351fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3352fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3353abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3354fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3355fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3356fcdce8c4SStefano Zampini   }
3357fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
33583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3359fcdce8c4SStefano Zampini }
3360fcdce8c4SStefano Zampini 
3361fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3362fcdce8c4SStefano Zampini 
3363fcdce8c4SStefano Zampini /* handles sparse or dense B */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat, 1);
  /* detect a dense B; only a GPU-resident (not bound-to-cpu) B/C can use the CUSPARSE kernels */
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
  }
  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    PetscBool   usecpu = PETSC_FALSE;
    const char *func = NULL, *apiopt = NULL, *prodtitle = NULL;

    /* map the product type to the user-facing API name (also used as the options-block
       title and man-page reference), its API-style -xxx_backend_cpu option, and the
       generic MatProduct_xxx title; product types without an entry query no option */
    if (product->type == MATPRODUCT_AB) {
      func      = "MatMatMult";
      apiopt    = "-matmatmult_backend_cpu";
      prodtitle = "MatProduct_AB";
    } else if (product->type == MATPRODUCT_AtB) {
      func      = "MatTransposeMatMult";
      apiopt    = "-mattransposematmult_backend_cpu";
      prodtitle = "MatProduct_AtB";
    } else if (product->type == MATPRODUCT_PtAP) {
      func      = "MatPtAP";
      apiopt    = "-matptap_backend_cpu";
      prodtitle = "MatProduct_PtAP";
    } else if (product->type == MATPRODUCT_RARt) {
      func      = "MatRARt";
      apiopt    = "-matrart_backend_cpu";
      prodtitle = "MatProduct_RARt";
    } else if (product->type == MATPRODUCT_ABC) {
      func      = "MatMatMatMult";
      apiopt    = "-matmatmatmult_backend_cpu";
      prodtitle = "MatProduct_ABC";
    }
    if (func) {
      if (product->api_user) { /* option named after the legacy API entry point */
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, func, "Mat");
        PetscCall(PetscOptionsBool(apiopt, "Use CPU code", func, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      } else { /* option named after the generic MatProduct interface */
        PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, prodtitle, "Mat");
        PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", func, usecpu, &usecpu, NULL));
        PetscOptionsEnd();
      }
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; /* user requested the CPU backend */
  }
  /* dispatch */
  if (isdense) {
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
  } else if (Biscusp && Ciscusp) {
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3480ccdfe979SStefano Zampini 
/* MatMult: yy = A*xx. Thin wrapper over the shared SpMV kernel. */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* trailing flags select (transpose, Hermitian) modes — cf. the sibling wrappers; both off for a plain multiply, and NULL means no vector is added in */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3487e6e9a74fSStefano Zampini 
3488d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3489d71ae5a4SJacob Faibussowitsch {
3490e6e9a74fSStefano Zampini   PetscFunctionBegin;
34919566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
34923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3493e6e9a74fSStefano Zampini }
3494e6e9a74fSStefano Zampini 
3495d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3496d71ae5a4SJacob Faibussowitsch {
3497e6e9a74fSStefano Zampini   PetscFunctionBegin;
34989566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
34993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3500e6e9a74fSStefano Zampini }
3501e6e9a74fSStefano Zampini 
3502d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3503d71ae5a4SJacob Faibussowitsch {
3504e6e9a74fSStefano Zampini   PetscFunctionBegin;
35059566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
35063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
35079ae82921SPaul Mullowney }
35089ae82921SPaul Mullowney 
3509d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3510d71ae5a4SJacob Faibussowitsch {
3511ca45077fSPaul Mullowney   PetscFunctionBegin;
35129566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
35133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3514ca45077fSPaul Mullowney }
3515ca45077fSPaul Mullowney 
3516d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3517d71ae5a4SJacob Faibussowitsch {
3518a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3519a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3520a0e72f99SJunchao Zhang }
3521a0e72f99SJunchao Zhang 
3522afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3523d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3524d71ae5a4SJacob Faibussowitsch {
35259ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3526aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
35279ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3528e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3529e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3530e6e9a74fSStefano Zampini   PetscBool                     compressed;
3531afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3532afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3533afb2bd1cSJunchao Zhang #endif
35346e111a19SKarl Rupp 
35359ae82921SPaul Mullowney   PetscFunctionBegin;
353608401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3537cbc6b225SStefano Zampini   if (!a->nz) {
3538995bce04SJacob Faibussowitsch     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3539995bce04SJacob Faibussowitsch     else PetscCall(VecSeq_CUDA::Set(zz, 0));
35403ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
3541e6e9a74fSStefano Zampini   }
354234d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
35439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3544e6e9a74fSStefano Zampini   if (!trans) {
35459ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
35465f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3547e6e9a74fSStefano Zampini   } else {
35481a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3549e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3550e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3551e6e9a74fSStefano Zampini     } else {
35529566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3553e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3554e6e9a74fSStefano Zampini     }
3555e6e9a74fSStefano Zampini   }
3556e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3557e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3558213423ffSJunchao Zhang 
3559e6e9a74fSStefano Zampini   try {
35609566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
356169d47153SPierre Jolivet     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
35629566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3563afb2bd1cSJunchao Zhang 
35649566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3565e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3566afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3567afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3568afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3569afb2bd1cSJunchao Zhang       */
3570e6e9a74fSStefano Zampini       xptr = xarray;
3571afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3572213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3573afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3574afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3575afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3576afb2bd1cSJunchao Zhang        */
3577afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3578afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3579afb2bd1cSJunchao Zhang         nx             = mat->num_cols;
3580afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3581afb2bd1cSJunchao Zhang       }
3582afb2bd1cSJunchao Zhang #endif
3583e6e9a74fSStefano Zampini     } else {
3584afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3585afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3586afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3587afb2bd1cSJunchao Zhang        */
3588afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3589e6e9a74fSStefano Zampini       dptr = zarray;
3590e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3591afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3592e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3593d0967f54SJacob Faibussowitsch 
3594d0967f54SJacob Faibussowitsch         thrust::for_each(
3595d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3596d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3597d0967f54SJacob Faibussowitsch #endif
3598d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
35999371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3600e6e9a74fSStefano Zampini       }
3601afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3602afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3603afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3604afb2bd1cSJunchao Zhang         nx             = mat->num_rows;
3605afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3606afb2bd1cSJunchao Zhang       }
3607afb2bd1cSJunchao Zhang #endif
3608e6e9a74fSStefano Zampini     }
36099ae82921SPaul Mullowney 
3610afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3611aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3612afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
36135f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3614afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
36159566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
36169566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
36179371c9d4SSatish Balay         PetscCallCUSPARSE(
36189371c9d4SSatish Balay           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
36199566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3620afb2bd1cSJunchao Zhang 
3621afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3622afb2bd1cSJunchao Zhang       } else {
3623afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
36249566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
36259566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3626afb2bd1cSJunchao Zhang       }
3627afb2bd1cSJunchao Zhang 
36289371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
36299371c9d4SSatish Balay                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3630afb2bd1cSJunchao Zhang #else
36317656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
36329371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3633afb2bd1cSJunchao Zhang #endif
3634aa372e3fSPaul Mullowney     } else {
3635213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3636afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3637afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3638afb2bd1cSJunchao Zhang #else
3639301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
36409371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3641afb2bd1cSJunchao Zhang #endif
3642a65300a6SPaul Mullowney       }
3643aa372e3fSPaul Mullowney     }
36449566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3645aa372e3fSPaul Mullowney 
3646e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3647213423ffSJunchao Zhang       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3648213423ffSJunchao Zhang         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3649995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3650e6e9a74fSStefano Zampini         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3651995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
36527656d835SStefano Zampini         }
3653213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3654995bce04SJacob Faibussowitsch         PetscCall(VecSeq_CUDA::Set(zz, 0));
36557656d835SStefano Zampini       }
36567656d835SStefano Zampini 
3657213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3658213423ffSJunchao Zhang       if (compressed) {
36599566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3660da81f932SPierre Jolivet         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3661a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3662a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3663a0e72f99SJunchao Zhang          */
3664a0e72f99SJunchao Zhang #if 0
3665a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3666a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3667a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3668e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3669c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3670a0e72f99SJunchao Zhang #else
3671a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3672a0e72f99SJunchao Zhang         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3673a0e72f99SJunchao Zhang #endif
36749566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3675e6e9a74fSStefano Zampini       }
3676e6e9a74fSStefano Zampini     } else {
3677995bce04SJacob Faibussowitsch       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3678e6e9a74fSStefano Zampini     }
36799566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
36809566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
36819566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3682d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3683d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3684d71ae5a4SJacob Faibussowitsch   }
3685e6e9a74fSStefano Zampini   if (yy) {
36869566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3687e6e9a74fSStefano Zampini   } else {
36889566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3689e6e9a74fSStefano Zampini   }
36903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
36919ae82921SPaul Mullowney }
36929ae82921SPaul Mullowney 
3693d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3694d71ae5a4SJacob Faibussowitsch {
3695ca45077fSPaul Mullowney   PetscFunctionBegin;
36969566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
36973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3698ca45077fSPaul Mullowney }
3699ca45077fSPaul Mullowney 
3700d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3701d71ae5a4SJacob Faibussowitsch {
3702042217e8SBarry Smith   PetscObjectState    onnz = A->nonzerostate;
3703042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
37043fa6b06aSMark Adams 
3705042217e8SBarry Smith   PetscFunctionBegin;
37069566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3707042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
37089566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
37099566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3710042217e8SBarry Smith     cusp->deviceMat = NULL;
3711042217e8SBarry Smith   }
37123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37139ae82921SPaul Mullowney }
37149ae82921SPaul Mullowney 
3715e057df02SPaul Mullowney /*@
371611a5261eSBarry Smith    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3717e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
371811a5261eSBarry Smith    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3719e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
372020f4b53cSBarry Smith    the parameter `nz` (or the array `nnz`).
37219ae82921SPaul Mullowney 
3722d083f849SBarry Smith    Collective
37239ae82921SPaul Mullowney 
37249ae82921SPaul Mullowney    Input Parameters:
372511a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
37269ae82921SPaul Mullowney .  m - number of rows
37279ae82921SPaul Mullowney .  n - number of columns
372820f4b53cSBarry Smith .  nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provide
372920f4b53cSBarry Smith -  nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
37309ae82921SPaul Mullowney 
37319ae82921SPaul Mullowney    Output Parameter:
37329ae82921SPaul Mullowney .  A - the matrix
37339ae82921SPaul Mullowney 
37342ef1f0ffSBarry Smith    Level: intermediate
37352ef1f0ffSBarry Smith 
37362ef1f0ffSBarry Smith    Notes:
373711a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
37389ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
373911a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
37409ae82921SPaul Mullowney 
374111a5261eSBarry Smith    The AIJ format, also called
37422ef1f0ffSBarry Smith    compressed row storage, is fully compatible with standard Fortran
37439ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
374420f4b53cSBarry Smith    either one (as in Fortran) or zero.
37459ae82921SPaul Mullowney 
37469ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
37472ef1f0ffSBarry Smith    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
374820f4b53cSBarry Smith    allocation.
37499ae82921SPaul Mullowney 
37502ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
37519ae82921SPaul Mullowney @*/
3752d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3753d71ae5a4SJacob Faibussowitsch {
37549ae82921SPaul Mullowney   PetscFunctionBegin;
37559566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
37569566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
37579566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
37589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
37593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37609ae82921SPaul Mullowney }
37619ae82921SPaul Mullowney 
3762d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3763d71ae5a4SJacob Faibussowitsch {
37649ae82921SPaul Mullowney   PetscFunctionBegin;
37659ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
37669566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
37679ae82921SPaul Mullowney   } else {
37689566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3769aa372e3fSPaul Mullowney   }
37709566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
37719566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
37729566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
37739566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
37749566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
37759566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
37769566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
37779566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
37789566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
37799566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
37809566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
37813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37829ae82921SPaul Mullowney }
37839ae82921SPaul Mullowney 
3784ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
378595639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3786d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3787d71ae5a4SJacob Faibussowitsch {
37889ff858a8SKarl Rupp   PetscFunctionBegin;
37899566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
37909566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
37913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
37929ff858a8SKarl Rupp }
37939ff858a8SKarl Rupp 
3794d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3795d71ae5a4SJacob Faibussowitsch {
3796a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3797039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3798039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3799039c6fbaSStefano Zampini   PetscScalar        *ay;
3800039c6fbaSStefano Zampini   const PetscScalar  *ax;
3801039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3802e6e9a74fSStefano Zampini 
380395639643SRichard Tran Mills   PetscFunctionBegin;
3804a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3805a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3806039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
38079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38089566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
38093ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
381095639643SRichard Tran Mills   }
3811039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
38129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
38139566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
38145f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
38155f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3816039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3817039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3818039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3819039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3820039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3821ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3822039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3823039c6fbaSStefano Zampini   }
3824d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3825d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3826039c6fbaSStefano Zampini 
3827039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3828039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3829039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3830039c6fbaSStefano Zampini     size_t bufferSize;
3831039c6fbaSStefano Zampini     void  *buffer;
3832039c6fbaSStefano Zampini #endif
3833039c6fbaSStefano Zampini 
38349566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
38359566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38369566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3837039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
38389371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38399371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
38409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
38419566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38429371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38439371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
38449566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
38459566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
38469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3847039c6fbaSStefano Zampini #else
38489566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38499371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
38509371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
38519566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
38529566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3853039c6fbaSStefano Zampini #endif
38549566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
38559566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
38569566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
38579566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3858039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3859a587d139SMark     cublasHandle_t cublasv2handle;
3860a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3861039c6fbaSStefano Zampini 
38629566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
38639566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38649566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
38659566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
38669566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
38679566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
38689566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
38699566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
38709566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
38719566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
38729566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3873039c6fbaSStefano Zampini   } else {
38749566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
38759566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3876a587d139SMark   }
38773ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
387895639643SRichard Tran Mills }
387995639643SRichard Tran Mills 
/* Y = a*Y, scaling every stored nonzero of Y on the GPU.
   The AIJ value array is treated as a flat vector of length nz and scaled with
   cuBLAS xSCAL; the nonzero pattern is unchanged. */
3880d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3881d71ae5a4SJacob Faibussowitsch {
388233c9ba73SStefano Zampini   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
388333c9ba73SStefano Zampini   PetscScalar   *ay;
388433c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
388533c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
388633c9ba73SStefano Zampini 
388733c9ba73SStefano Zampini   PetscFunctionBegin;
38889566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
38899566063dSJacob Faibussowitsch   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
38909566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(y->nz, &bnz));
38919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
38929566063dSJacob Faibussowitsch   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
38939566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(bnz));
38949566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
38959566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
/* values changed, so any cached diagonal information is now stale */
38969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
38973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
389833c9ba73SStefano Zampini }
389933c9ba73SStefano Zampini 
/* Zero all stored nonzeros of A, keeping the nonzero pattern.
   For an unfactored matrix, the device CSR values (and the cached transpose
   values, if present) are zeroed directly with thrust::fill; the host value
   array is always zeroed as well. The offload mask becomes BOTH only when the
   device copy was also zeroed, otherwise CPU. */
3900d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3901d71ae5a4SJacob Faibussowitsch {
39027e8381f9SStefano Zampini   PetscBool   both = PETSC_FALSE;
3903a587d139SMark   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
39047e8381f9SStefano Zampini 
39053fa6b06aSMark Adams   PetscFunctionBegin;
39063fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
39073fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
39087e8381f9SStefano Zampini     if (spptr->mat) {
39097e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
39107e8381f9SStefano Zampini       if (matrix->values) {
39117e8381f9SStefano Zampini         both = PETSC_TRUE;
39127e8381f9SStefano Zampini         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39137e8381f9SStefano Zampini       }
39147e8381f9SStefano Zampini     }
/* also zero the cached transpose so it stays consistent with the matrix */
39157e8381f9SStefano Zampini     if (spptr->matTranspose) {
39167e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3917ad540459SPierre Jolivet       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
39187e8381f9SStefano Zampini     }
39193fa6b06aSMark Adams   }
/* zero the host-side values; a->i[nrows] is the total number of stored nonzeros */
39209566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
39219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
39227e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3923a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
39243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
39253fa6b06aSMark Adams }
39263fa6b06aSMark Adams 
/* Bind the matrix to the CPU (flg == PETSC_TRUE) or re-enable GPU execution.
   Swaps the Mat operation table and the Mat_SeqAIJ array-access hooks between
   the plain SeqAIJ implementations and the CUSPARSE ones, and (un)composes the
   CUSPARSE-specific query functions (COO preallocation, product setup, ...).
   Factored matrices only record the flag. When binding to the CPU, the values
   are first copied down from the GPU so CPU operations see current data. */
3927d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3928d71ae5a4SJacob Faibussowitsch {
3929a587d139SMark   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3930a587d139SMark 
3931a587d139SMark   PetscFunctionBegin;
39329a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
39339a14fc28SStefano Zampini     A->boundtocpu = flg;
39343ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
39359a14fc28SStefano Zampini   }
3936a587d139SMark   if (flg) {
39379566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3938a587d139SMark 
/* install the host (SeqAIJ) implementations */
393933c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3940a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3941a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3942a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3943a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3944a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3945a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3946a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3947a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3948fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
/* clearing a->ops reverts the array-access hooks (getarray etc.) to the SeqAIJ defaults */
39499566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
39509566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
39519566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
39529566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
39539566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
39549566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
39559566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3956a587d139SMark   } else {
/* install the device (CUSPARSE) implementations */
395733c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3958a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3959a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3960a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3961a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3962a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3963a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3964a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3965a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3966fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
396767a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
396867a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
396967a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
397067a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
397167a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
397267a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
39737ee59b9bSJunchao Zhang     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
39747ee59b9bSJunchao Zhang 
39759566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
39769566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
39779566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
39789566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
39799566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
39809566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3981a587d139SMark   }
3982a587d139SMark   A->boundtocpu = flg;
/* inode optimizations only apply to the CPU code path */
3983ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
3984ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
3985ea500dcfSRichard Tran Mills   } else {
3986ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
3987ea500dcfSRichard Tran Mills   }
39883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3989a587d139SMark }
3990a587d139SMark 
/* Convert a SeqAIJ matrix into a SEQAIJCUSPARSE matrix.
   MAT_INITIAL_MATRIX duplicates A into *newmat; MAT_REUSE_MATRIX copies the
   values into an existing *newmat; otherwise (in-place conversion, as used by
   MatCreate_SeqAIJCUSPARSE) *newmat is A itself. On first conversion the
   cuSPARSE context (handle, stream, storage-format/algorithm defaults, or the
   triangular-factor variant for factored matrices) is allocated, then the
   CUSPARSE-specific operations and composed functions are installed. */
39918eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3992d71ae5a4SJacob Faibussowitsch {
399349735bf3SStefano Zampini   Mat B;
39949ae82921SPaul Mullowney 
39959ae82921SPaul Mullowney   PetscFunctionBegin;
39969566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
399749735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
39989566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
399949735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
40009566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
400149735bf3SStefano Zampini   }
400249735bf3SStefano Zampini   B = *newmat;
400349735bf3SStefano Zampini 
/* vectors created for this matrix should default to the CUDA type */
40049566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
40059566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
400634136279SStefano Zampini 
400749735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
40089ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
4009e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
40109566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
40119566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
40129566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
40131a2c6b5cSJunchao Zhang       spptr->format = MAT_CUSPARSE_CSR;
4014d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4015*b917901dSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4016a435da06SStefano Zampini       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
4017a435da06SStefano Zampini   #else
4018d8132acaSStefano Zampini       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
4019a435da06SStefano Zampini   #endif
4020d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
4021d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
4022d8132acaSStefano Zampini #endif
40231a2c6b5cSJunchao Zhang       B->spptr = spptr;
40249ae82921SPaul Mullowney     } else {
/* factored matrices carry the triangular-factor context instead */
4025e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
4026e6e9a74fSStefano Zampini 
40279566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
40289566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
40299566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
4030e6e9a74fSStefano Zampini       B->spptr = spptr;
40319ae82921SPaul Mullowney     }
4032e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
403349735bf3SStefano Zampini   }
4034693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
40359ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
40361a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
40379ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
403895639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
4039693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
40402205254eSKarl Rupp 
/* installs the GPU operation tables and composed functions */
40419566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
40429566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
40439566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
4044ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
40459566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
4046ae48a8d0SStefano Zampini #endif
40479566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
40483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
40499ae82921SPaul Mullowney }
40509ae82921SPaul Mullowney 
/* Constructor for MATSEQAIJCUSPARSE: create a plain SeqAIJ matrix, then convert
   it in place (MAT_INPLACE_MATRIX) to the CUSPARSE type. */
4051d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
4052d71ae5a4SJacob Faibussowitsch {
405302fe1965SBarry Smith   PetscFunctionBegin;
40549566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
40559566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
40563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
405702fe1965SBarry Smith }
405802fe1965SBarry Smith 
40593ca39a21SBarry Smith /*MC
4060e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4061e057df02SPaul Mullowney 
406211a5261eSBarry Smith    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
406311a5261eSBarry Smith    CSR, ELL, or Hybrid format.
406411a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4065e057df02SPaul Mullowney 
4066e057df02SPaul Mullowney    Options Database Keys:
406711a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
40682ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
40692ef1f0ffSBarry Smith                                       Other options include ell (ellpack) or hyb (hybrid).
40702ef1f0ffSBarry Smith .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
40712ef1f0ffSBarry Smith -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4072e057df02SPaul Mullowney 
4073e057df02SPaul Mullowney   Level: beginner
4074e057df02SPaul Mullowney 
40752ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4076e057df02SPaul Mullowney M*/
40777f756511SDominic Meiser 
/* Factory for the banded-LU variant, defined elsewhere. */
4078bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
40790f39cd5aSBarry Smith 
/* Register the cuSPARSE-based factorization backends (LU, Cholesky, ILU, ICC,
   and the banded-LU solver) with PETSc's solver-type registry. */
4080d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
4081d71ae5a4SJacob Faibussowitsch {
408242c9c57cSBarry Smith   PetscFunctionBegin;
40839566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
40849566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
40859566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
40869566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
40879566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
4088bddcd29dSMark Adams 
40893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
409042c9c57cSBarry Smith }
409129b38603SBarry Smith 
/* Release COO-assembly state held in the cuSPARSE struct: the host-side
   permutation vectors and, when the extended COO path was used, the device
   jmap/perm arrays. No-op when the matrix has no cuSPARSE struct. */
4092d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
4093d71ae5a4SJacob Faibussowitsch {
4094cbc6b225SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;
4095cbc6b225SStefano Zampini 
4096cbc6b225SStefano Zampini   PetscFunctionBegin;
40973ba16761SJacob Faibussowitsch   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4098cbc6b225SStefano Zampini   delete cusp->cooPerm;
4099cbc6b225SStefano Zampini   delete cusp->cooPerm_a;
4100cbc6b225SStefano Zampini   cusp->cooPerm   = NULL;
4101cbc6b225SStefano Zampini   cusp->cooPerm_a = NULL;
4102cbc6b225SStefano Zampini   if (cusp->use_extended_coo) {
41039566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->jmap_d));
41049566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->perm_d));
4105cbc6b225SStefano Zampini   }
4106cbc6b225SStefano Zampini   cusp->use_extended_coo = PETSC_FALSE;
41073ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4108cbc6b225SStefano Zampini }
4109cbc6b225SStefano Zampini 
/* Tear down the whole Mat_SeqAIJCUSPARSE context: the mat and matTranspose
   multiply structures, scratch vectors/arrays, the cuSPARSE handle, and any
   device COO arrays, then free the struct itself. Safe on NULL. */
4110d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
4111d71ae5a4SJacob Faibussowitsch {
41127f756511SDominic Meiser   PetscFunctionBegin;
41137f756511SDominic Meiser   if (*cusparsestruct) {
41149566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
41159566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
41167f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
411781902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
41187e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
41197e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
4120a49f1ed0SStefano Zampini     delete (*cusparsestruct)->csr2csc_i;
41219566063dSJacob Faibussowitsch     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
41229566063dSJacob Faibussowitsch     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
41239566063dSJacob Faibussowitsch     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
41249566063dSJacob Faibussowitsch     PetscCall(PetscFree(*cusparsestruct));
41257f756511SDominic Meiser   }
41263ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41277f756511SDominic Meiser }
41287f756511SDominic Meiser 
/* Delete the thrust vectors backing a CsrMatrix (values, column indices,
   row offsets), then the CsrMatrix itself, and NULL the pointer. Safe on NULL. */
4129d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
4130d71ae5a4SJacob Faibussowitsch {
41317f756511SDominic Meiser   PetscFunctionBegin;
41327f756511SDominic Meiser   if (*mat) {
41337f756511SDominic Meiser     delete (*mat)->values;
41347f756511SDominic Meiser     delete (*mat)->column_indices;
41357f756511SDominic Meiser     delete (*mat)->row_offsets;
41367f756511SDominic Meiser     delete *mat;
41377f756511SDominic Meiser     *mat = 0;
41387f756511SDominic Meiser   }
41393ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41407f756511SDominic Meiser }
41417f756511SDominic Meiser 
/* Free one triangular-factor structure (descriptor, csrsv solve info, CSR data,
   and scratch buffers). Only compiled for CUDA < 11.4, where the legacy csrsv
   triangular-solve path is used. Safe on NULL. */
4142*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4143d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
4144d71ae5a4SJacob Faibussowitsch {
41457f756511SDominic Meiser   PetscFunctionBegin;
41467f756511SDominic Meiser   if (*trifactor) {
41479566063dSJacob Faibussowitsch     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
4148261a78b4SJunchao Zhang     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
41499566063dSJacob Faibussowitsch     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
41509566063dSJacob Faibussowitsch     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
41519566063dSJacob Faibussowitsch     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
4152afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
41539566063dSJacob Faibussowitsch     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
4154afb2bd1cSJunchao Zhang   #endif
41559566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactor));
41567f756511SDominic Meiser   }
41573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41587f756511SDominic Meiser }
4159d460d7bfSJunchao Zhang #endif
41607f756511SDominic Meiser 
/* Free a SpMV multiply structure for the given storage format: the underlying
   CSR (or pre-CUDA-11 hybrid) matrix, the matrix descriptor, device scalar
   constants, and (CUDA >= 11) the generic-API SpMat/DnVec descriptors and SpMV
   buffers. Safe on NULL; NULLs the pointer on exit. */
4161d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
4162d71ae5a4SJacob Faibussowitsch {
41637f756511SDominic Meiser   CsrMatrix *mat;
41647f756511SDominic Meiser 
41657f756511SDominic Meiser   PetscFunctionBegin;
41667f756511SDominic Meiser   if (*matstruct) {
41677f756511SDominic Meiser     if ((*matstruct)->mat) {
41687f756511SDominic Meiser       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
4169afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4170afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
4171afb2bd1cSJunchao Zhang #else
41727f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
41739566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
4174afb2bd1cSJunchao Zhang #endif
41757f756511SDominic Meiser       } else {
41767f756511SDominic Meiser         mat = (CsrMatrix *)(*matstruct)->mat;
41773ba16761SJacob Faibussowitsch         PetscCall(CsrMatrix_Destroy(&mat));
41787f756511SDominic Meiser       }
41797f756511SDominic Meiser     }
41809566063dSJacob Faibussowitsch     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
41817f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
/* device-resident 1.0/0.0 scalars used by the SpMV calls */
41829566063dSJacob Faibussowitsch     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
41839566063dSJacob Faibussowitsch     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
41849566063dSJacob Faibussowitsch     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
4185afb2bd1cSJunchao Zhang 
4186afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
4187afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
41889566063dSJacob Faibussowitsch     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
/* one cached SpMV descriptor set per operation variant (N/T/H) */
4189afb2bd1cSJunchao Zhang     for (int i = 0; i < 3; i++) {
4190afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
41919566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
41929566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
41939566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
4194afb2bd1cSJunchao Zhang       }
4195afb2bd1cSJunchao Zhang     }
4196afb2bd1cSJunchao Zhang #endif
41977f756511SDominic Meiser     delete *matstruct;
41987e8381f9SStefano Zampini     *matstruct = NULL;
41997f756511SDominic Meiser   }
42003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42017f756511SDominic Meiser }
42027f756511SDominic Meiser 
/* Release all factorization resources held by a triangular-factors context,
   without destroying the context (or its cuSPARSE handle) itself.
   For CUDA < 11.4 this frees the four legacy csrsv factor structures and the
   work vector; for CUDA >= 11.4 it frees the SpSV-based factor data (CSR
   arrays, SpSV/ILU0/IC0 descriptors, dense-vector descriptors, buffers) and
   the host-side shadow arrays. Permutation vectors and band data are freed in
   both configurations. Safe when *trifactors is NULL. */
4203d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
4204d71ae5a4SJacob Faibussowitsch {
4205da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
4206da112707SJunchao Zhang 
42077f756511SDominic Meiser   PetscFunctionBegin;
4208da112707SJunchao Zhang   if (fs) {
4209*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
4210da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
4211da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
4212da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
4213da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
4214d460d7bfSJunchao Zhang     delete fs->workVector;
4215d460d7bfSJunchao Zhang     fs->workVector = NULL;
4216d460d7bfSJunchao Zhang #endif
4217da112707SJunchao Zhang     delete fs->rpermIndices;
4218da112707SJunchao Zhang     delete fs->cpermIndices;
4219da112707SJunchao Zhang     fs->rpermIndices = NULL;
4220da112707SJunchao Zhang     fs->cpermIndices = NULL;
4221da112707SJunchao Zhang     if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
4222da112707SJunchao Zhang     if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
4223da112707SJunchao Zhang     fs->init_dev_prop = PETSC_FALSE;
4224*b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
4225da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr));
4226da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx));
422730807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr32));
422830807b38SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx32));
4229da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrVal));
4230d460d7bfSJunchao Zhang     PetscCallCUDA(cudaFree(fs->diag));
4231da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->X));
4232da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->Y));
423312ba2bc6SJunchao Zhang     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
4234da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
4235da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
423612ba2bc6SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
4237da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
4238da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
4239da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
4240da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
4241da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
4242da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
4243da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
4244da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
4245da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
4246da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
4247da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
4248da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
4249d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrRowPtr_h));
4250d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->csrVal_h));
4251d460d7bfSJunchao Zhang     PetscCall(PetscFree(fs->diag_h));
425212ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
425312ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
4254da112707SJunchao Zhang #endif
4255ccdfe979SStefano Zampini   }
42563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4257ccdfe979SStefano Zampini }
4258ccdfe979SStefano Zampini 
/* Fully destroy a triangular-factors context: reset (free) its contents,
   destroy its cuSPARSE handle, and free the struct. Safe on NULL. */
4259d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
4260d71ae5a4SJacob Faibussowitsch {
4261ccdfe979SStefano Zampini   PetscFunctionBegin;
4262ccdfe979SStefano Zampini   if (*trifactors) {
42639566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
4264f0173cd6SStefano Zampini     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
42659566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactors));
42667f756511SDominic Meiser   }
42673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42687f756511SDominic Meiser }
42697e8381f9SStefano Zampini 
/* Lexicographic less-than on (row, col) index tuples; used to sort COO entries. */
42709371c9d4SSatish Balay struct IJCompare {
4271d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4272d71ae5a4SJacob Faibussowitsch   {
42737e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
42747e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
42757e8381f9SStefano Zampini     return false;
42767e8381f9SStefano Zampini   }
42777e8381f9SStefano Zampini };
42787e8381f9SStefano Zampini 
/* Equality of (row, col) index tuples; used to detect duplicate COO entries. */
42799371c9d4SSatish Balay struct IJEqual {
4280d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
4281d71ae5a4SJacob Faibussowitsch   {
42827e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
42837e8381f9SStefano Zampini     return true;
42847e8381f9SStefano Zampini   }
42857e8381f9SStefano Zampini };
42867e8381f9SStefano Zampini 
/* Binary op returning 0 when the two values are equal and 1 otherwise. */
42879371c9d4SSatish Balay struct IJDiff {
42889371c9d4SSatish Balay   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
42897e8381f9SStefano Zampini };
42907e8381f9SStefano Zampini 
/* Binary op computing the logical OR of two integers (result 0 or 1). */
42919371c9d4SSatish Balay struct IJSum {
42929371c9d4SSatish Balay   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
42937e8381f9SStefano Zampini };
42947e8381f9SStefano Zampini 
42957e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
4296219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert or add the COO values v[] into the device CSR value array, using the
   permutation cusp->cooPerm (and, when duplicates exist, the reduction map
   cusp->cooPerm_a) built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   A     - the MATSEQAIJCUSPARSE matrix
   v     - COO values, in host or device memory; NULL means "no values"
           (with INSERT_VALUES the matrix values are zeroed)
   imode - INSERT_VALUES or ADD_VALUES

   On success the up-to-date data lives on the GPU (offloadmask = PETSC_OFFLOAD_GPU). */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device staging buffer, only allocated when v[] is on host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  /* no COO permutation cached: the matrix was not preallocated via the COO path; just (re)assemble */
  if (!cusp->cooPerm) {
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* v[] is on host: stage a copy on the device */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce; the reduced sums overwrite values[] directly */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries with INSERT_VALUES: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
43737e8381f9SStefano Zampini 
/* Mark the cached transpose of A as out of date. When destroy is true, also free
   the cached transpose multiply structure and the csr2csc permutation so they are
   rebuilt from scratch on next use. No-op when the CUSPARSE struct is absent. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE; /* force regeneration of the transpose before its next use */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4389a49f1ed0SStefano Zampini 
43907e8381f9SStefano Zampini #include <thrust/binary_search.h>
4391219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR nonzero structure of A on the device from n COO index pairs
   (coo_i[], coo_j[]), which may live in host or device memory, and cache the
   permutation (cusp->cooPerm) and duplicate-reduction map (cusp->cooPerm_a)
   needed later by MatSetValuesCOO_SeqAIJCUSPARSE_Basic().

   'Basic' path only: indices must be non-negative (no "ignore this entry" markers). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  /* discard any previously cached permutation if the COO size changed */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* make coo_i[] accessible on the device, copying it over if it is host memory */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    /* same for coo_j[]; the two arrays may be in different memory spaces */
    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] below */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host-side (a->i, a->j, a->a) CSR arrays with ones matching the structure just computed */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    /* fill the per-row length bookkeeping from the row offsets */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4526ed502f03SStefano Zampini 
/* COO preallocation entry point for MATSEQAIJCUSPARSE.
   Dispatches to the fast 'Basic' device path when coo_i[]/coo_j[] live on the
   device or contain no negative (ignored) indices; otherwise falls back to the
   host SeqAIJ implementation and mirrors its jmap/perm arrays on the device
   for the extended-COO value kernel. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool    basic = PETSC_TRUE; /* eligible for the Basic path? */
  PetscMemType mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  /* negative (ignored) indices can only be scanned for when the arrays are on host */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      PetscCount k = 0;
      while (k < coo_n && coo_i[k] >= 0 && coo_j[k] >= 0) k++;
      if (k < coo_n) basic = PETSC_FALSE; /* found a negative index */
    }
  }

  if (basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* mirror the host-built COO maps on the device for MatAddCOOValues() */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4565219fbbafSJunchao Zhang 
/* Grid-stride kernel: for each of the nnz unique nonzeros, accumulate the COO
   input values kv[] that map to it (their positions are perm[jmap[i]..jmap[i+1])),
   then insert (INSERT_VALUES) or add (otherwise) the sum into the CSR values a[].
   Any 1D grid/block configuration is valid thanks to the grid-stride loop. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4576219fbbafSJunchao Zhang 
/* Set (INSERT_VALUES) or add (ADD_VALUES) the COO values v[] on a MATSEQAIJCUSPARSE
   matrix. Uses the extended-COO device maps (jmap_d/perm_d) when they were built at
   preallocation time; otherwise defers to the Basic thrust-based implementation.
   v[] may reside in host or device memory; host values are staged on the device. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  } else {
    const PetscScalar *d_v  = v;       /* device-visible alias of v[] */
    const PetscCount   Annz = seq->nz; /* number of unique nonzeros in A */
    PetscMemType       memtype;
    PetscScalar       *Aa;

    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* stage host values in a temporary device buffer */
      PetscCallCUDA(cudaMalloc((void **)&d_v, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)d_v, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so no need to bring old values up to date */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(d_v, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)d_v));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4611219fbbafSJunchao Zhang 
46125b7e41feSStefano Zampini /*@C
46132ef1f0ffSBarry Smith     MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
46145b7e41feSStefano Zampini 
46152ef1f0ffSBarry Smith    Not Collective
46165b7e41feSStefano Zampini 
46175b7e41feSStefano Zampini     Input Parameters:
46185b7e41feSStefano Zampini +   A - the matrix
461911a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
46205b7e41feSStefano Zampini 
46215b7e41feSStefano Zampini     Output Parameters:
462220f4b53cSBarry Smith +   i - the CSR row pointers
462320f4b53cSBarry Smith -   j - the CSR column indices
46245b7e41feSStefano Zampini 
46255b7e41feSStefano Zampini     Level: developer
46265b7e41feSStefano Zampini 
462711a5261eSBarry Smith     Note:
46285b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
46295b7e41feSStefano Zampini 
46302ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
46315b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *aij            = (Mat_SeqAIJ *)A->data;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both pointers are required; nothing to do otherwise */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes raw i/j arrays */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  if (compressed || !aij->compressedrow.use) {
    *i = csrmat->row_offsets->data().get();
  } else {
    /* caller wants the full (uncompressed) row offsets: build and cache them once on the device */
    if (!cusparsestruct->rowoffsets_gpu) {
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(aij->i, aij->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusparsestruct->rowoffsets_gpu->data().get();
  }
  *j = csrmat->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
46595f101d05SStefano Zampini 
46605b7e41feSStefano Zampini /*@C
46612ef1f0ffSBarry Smith     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
46625b7e41feSStefano Zampini 
46632ef1f0ffSBarry Smith    Not Collective
46645b7e41feSStefano Zampini 
46655b7e41feSStefano Zampini     Input Parameters:
46665b7e41feSStefano Zampini +   A - the matrix
46672ef1f0ffSBarry Smith .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
466820f4b53cSBarry Smith .   i - the CSR row pointers
466920f4b53cSBarry Smith -   j - the CSR column indices
46705b7e41feSStefano Zampini 
46715b7e41feSStefano Zampini     Level: developer
46725b7e41feSStefano Zampini 
46732ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
46745b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* unused: nothing to undo, only the caller's pointers are invalidated */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
46855f101d05SStefano Zampini 
46865b7e41feSStefano Zampini /*@C
468711a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
46885b7e41feSStefano Zampini 
46895b7e41feSStefano Zampini    Not Collective
46905b7e41feSStefano Zampini 
46915b7e41feSStefano Zampini    Input Parameter:
469211a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
46935b7e41feSStefano Zampini 
46945b7e41feSStefano Zampini    Output Parameter:
46955b7e41feSStefano Zampini .   a - pointer to the device data
46965b7e41feSStefano Zampini 
46975b7e41feSStefano Zampini    Level: developer
46985b7e41feSStefano Zampini 
469911a5261eSBarry Smith    Note:
470011a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
47015b7e41feSStefano Zampini 
47022ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
47035b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw value array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may trigger a host-to-device copy */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4721ed502f03SStefano Zampini 
47225b7e41feSStefano Zampini /*@C
472311a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
47245b7e41feSStefano Zampini 
47255b7e41feSStefano Zampini    Not Collective
47265b7e41feSStefano Zampini 
47272ef1f0ffSBarry Smith    Input Parameters:
47282ef1f0ffSBarry Smith +   A - a `MATSEQAIJCUSPARSE` matrix
47292ef1f0ffSBarry Smith -   a - pointer to the device data
47305b7e41feSStefano Zampini 
47315b7e41feSStefano Zampini    Level: developer
47325b7e41feSStefano Zampini 
47332ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
47345b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no data movement or matrix state change needed; just invalidate the caller's pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4744ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous value array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write semantics: bring the device copy up to date before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify device values, so the host copy becomes stale and
     any cached explicit transpose must be rebuilt lazily */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data

   Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate the borrowed pointer first, then record that the values may
     have changed: the cached diagonal is stale and the object state bumps */
  *a = NULL;
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4806039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous value array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device
     copy is performed; the existing device values will be overwritten */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* GPU now owns the authoritative values; cached transpose must be rebuilt */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4843ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data

   Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate the borrowed pointer, then record the modification: the
     cached diagonal is stale and the object state must be bumped */
  *a = NULL;
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4868ed502f03SStefano Zampini 
/* Strict weak ordering for (row, col, value, tag) zip tuples: compares
   lexicographically by row then column, ignoring the value and tag slots.
   Used by thrust::merge in MatSeqAIJCUSPARSEMergeMats() to interleave the
   COO entries of two matrices row by row. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;   /* different rows: order by row index */
    return t1.get<1>() < t2.get<1>(); /* same row: order by column index */
  }
};
4877ed502f03SStefano Zampini 
/* Unary functor adding a fixed integer offset; used with
   thrust::make_transform_iterator to shift the column indices (and row
   offsets) of the second matrix when two matrices are concatenated. */
struct Shift {
  int _shift; /* offset applied to every element */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4884ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation

   Input Parameters:
+  A     - a MATSEQAIJCUSPARSE matrix
.  B     - a MATSEQAIJCUSPARSE matrix with the same number of rows as A
-  reuse - MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX (MAT_INPLACE_MATRIX is not supported)

   Output Parameter:
.  C - the merged matrix; the column indices of B are shifted by the number of columns of A

   Notes:
   With MAT_INITIAL_MATRIX the sparsity pattern of C is built on the GPU and the
   merge permutation is cached in C's cooPerm; with MAT_REUSE_MATRIX only the
   numerical values are copied through that cached permutation.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscInt                      Annz, Bnnz;
  cusparseStatus_t              stat;
  PetscInt                      i, m, n, zero = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscValidPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
  /* fixed typo in the error message: "number or rows" -> "number of rows" */
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
  PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    m = A->rmap->n;
    n = A->cmap->n + B->cmap->n;
    PetscCall(MatCreate(PETSC_COMM_SELF, C));
    PetscCall(MatSetSizes(*C, m, n, m, n));
    PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
    /* build C's cuSPARSE mult structure by hand; C is not compressed */
    c                       = (Mat_SeqAIJ *)(*C)->data;
    Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
    Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr                    = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows            = m;
    Ccusp->mat              = Cmat;
    Ccusp->mat->mat         = Ccsr;
    Ccsr->num_rows          = m;
    Ccsr->num_cols          = n;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
    PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
    PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr                 = (CsrMatrix *)Acusp->mat->mat;
    Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
    Annz                 = (PetscInt)Acsr->column_indices->size();
    Bnnz                 = (PetscInt)Bcsr->column_indices->size();
    c->nz                = Annz + Bnnz;
    Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values         = new THRUSTARRAY(c->nz);
    Ccsr->num_entries    = c->nz;
    Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      auto              Acoo = new THRUSTINTARRAY32(Annz);
      auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
      auto              Ccoo = new THRUSTINTARRAY32(c->nz);
      THRUSTINTARRAY32 *Aroff, *Broff;

      /* the COO conversion below needs uncompressed row offsets */
      if (a->compressedrow.use) { /* need full row offset */
        if (!Acusp->rowoffsets_gpu) {
          Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
          Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
        }
        Aroff = Acusp->rowoffsets_gpu;
      } else Aroff = Acsr->row_offsets;
      if (b->compressedrow.use) { /* need full row offset */
        if (!Bcusp->rowoffsets_gpu) {
          Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
          Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
          PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
        }
        Broff = Bcusp->rowoffsets_gpu;
      } else Broff = Bcsr->row_offsets;
      PetscCall(PetscLogGpuTimeBegin());
      /* expand CSR row offsets of A and B to COO row indices on the device */
      stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      /* tag A entries with 1 and B entries with 0 so the permutation split below
         can separate them after the merge.
         Issues when using bool with large matrices on SUMMIT 10.2.89 */
      auto Aperm = thrust::make_constant_iterator(1);
      auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
      /* view B's column indices shifted by A's column count, without modifying B */
      auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
      auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
#else
      /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
      auto Bcib = Bcsr->column_indices->begin();
      auto Bcie = Bcsr->column_indices->end();
      thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
#endif
      /* merge the two COO streams row-major (IJCompare4), producing C's columns,
         values, and a 0/1 tag stream (wPerm) saying which input each entry came from */
      auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
      auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
      auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
      auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
      auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
      auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
      auto p1    = Ccusp->cooPerm->begin();
      auto p2    = Ccusp->cooPerm->begin();
      thrust::advance(p2, Annz);
      PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
      /* undo the in-place shift applied to B's column indices above */
      thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
#endif
      /* cooPerm[0..Annz) = positions of A's entries in C, cooPerm[Annz..) = positions of B's */
      auto cci = thrust::make_counting_iterator(zero);
      auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
      PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
#else
      auto pred = thrust::identity<int>();
      PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
      PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
#endif
      /* compress merged COO row indices back into C's CSR row offsets */
      stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
      PetscCallCUSPARSE(stat);
      PetscCall(PetscLogGpuTimeEnd());
      delete wPerm;
      delete Acoo;
      delete Bcoo;
      delete Ccoo;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#endif
      if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
        PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
        PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix                    *CcsrT = new CsrMatrix;
        CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;

        (*C)->form_explicit_transpose = PETSC_TRUE;
        (*C)->transupdated            = PETSC_TRUE;
        Ccusp->rowoffsets_gpu         = NULL;
        CmatT->cprowIndices           = NULL;
        CmatT->mat                    = CcsrT;
        CcsrT->num_rows               = n;
        CcsrT->num_cols               = m;
        CcsrT->num_entries            = c->nz;

        CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values         = new THRUSTARRAY(c->nz);

        /* C' = [A' ; B'] stacked vertically: concatenate A' and B' arrays, with
           B''s row offsets shifted by A's nonzero count */
        PetscCall(PetscLogGpuTimeBegin());
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
          thrust::advance(rT, -1); /* overlap the last offset of A' with the first of B' */
        }
        if (BT) {
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
          thrust::copy(titb, tite, rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        PetscCall(PetscLogGpuTimeEnd());

        PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
        PetscCallCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror C's sparsity pattern on the host so Mat_SeqAIJ bookkeeping works */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(m + 1, &c->i));
    PetscCall(PetscMalloc1(c->nz, &c->j));
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii = *Ccsr->row_offsets;
      jj = *Ccsr->column_indices;
      PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    } else {
      PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
      PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    }
    PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
    PetscCall(PetscMalloc1(m, &c->ilen));
    PetscCall(PetscMalloc1(m, &c->imax));
    c->maxnz         = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax          = 0;
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i + 1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt) !!nn;
      c->rmax = PetscMax(c->rmax, nn);
    }
    PetscCall(MatMarkDiagonal_SeqAIJ(*C));
    PetscCall(PetscMalloc1(c->nz, &c->a));
    (*C)->nonzerostate++;
    PetscCall(PetscLayoutSetUp((*C)->rmap));
    PetscCall(PetscLayoutSetUp((*C)->cmap));
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* MAT_REUSE_MATRIX: pattern unchanged, scatter the new values of A and B
       into C through the cached merge permutation.
       (error-message typo fixed here too: "number or rows" -> "number of rows") */
    PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number of rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
    c = (Mat_SeqAIJ *)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
      PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
      PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
      PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
      PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix *)Acusp->mat->mat;
      Bcsr = (CsrMatrix *)Bcusp->mat->mat;
      Ccsr = (CsrMatrix *)Ccusp->mat->mat;
      PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
      PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
      PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
      PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
      PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid, Acsr->num_entries);
      PetscCall(PetscLogGpuTimeBegin());
      /* scatter A's values into C at cooPerm[0..Annz), then B's at cooPerm[Annz..) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      thrust::for_each(zibait, zieait, VecCUDAEquals());
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
      thrust::for_each(zibbit, ziebit, VecCUDAEquals());
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
      if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
        PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
        auto       vT    = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
        if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
        (*C)->transupdated = PETSC_TRUE;
      }
      PetscCall(PetscLogGpuTimeEnd());
    }
  }
  PetscCall(PetscObjectStateIncrease((PetscObject)*C));
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
5166c215019aSStefano Zampini 
5167d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
5168d71ae5a4SJacob Faibussowitsch {
5169c215019aSStefano Zampini   bool               dmem;
5170c215019aSStefano Zampini   const PetscScalar *av;
5171c215019aSStefano Zampini 
5172c215019aSStefano Zampini   PetscFunctionBegin;
5173c215019aSStefano Zampini   dmem = isCudaMem(v);
51749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
5175c215019aSStefano Zampini   if (n && idx) {
5176c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
5177c215019aSStefano Zampini     widx.assign(idx, idx + n);
51789566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
5179c215019aSStefano Zampini 
5180c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
5181c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
5182c215019aSStefano Zampini     if (dmem) {
5183c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
5184c215019aSStefano Zampini     } else {
5185c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
5186c215019aSStefano Zampini       dv = w->data();
5187c215019aSStefano Zampini     }
5188c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
5189c215019aSStefano Zampini 
5190c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
5191c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
5192c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
519348a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
5194c215019aSStefano Zampini     delete w;
5195c215019aSStefano Zampini   } else {
51969566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
5197c215019aSStefano Zampini   }
51989566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
51999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
52003ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5201c215019aSStefano Zampini }
5202