xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bdb0d812d7d16c312f15c80a34abe98d607d2193)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
19d0967f54SJacob Faibussowitsch #endif
20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
21a2cee5feSJed Brown #include <thrust/remove.h>
22a2cee5feSJed Brown #include <thrust/sort.h>
23a2cee5feSJed Brown #include <thrust/unique.h>
24e8d2b73aSMark Adams 
25e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29afb2bd1cSJunchao Zhang 
30afb2bd1cSJunchao Zhang   typedef enum {
31afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
33afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
35afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
36afb2bd1cSJunchao Zhang 
37afb2bd1cSJunchao Zhang   typedef enum {
38afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
50afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
51afb2bd1cSJunchao Zhang 
52afb2bd1cSJunchao Zhang   typedef enum {
5335cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
5435cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
56afb2bd1cSJunchao Zhang   */
57afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60afb2bd1cSJunchao Zhang #endif
619ae82921SPaul Mullowney 
62087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
656fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
66b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
69d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71d460d7bfSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
72d460d7bfSJunchao Zhang #endif
73dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
74a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7533c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
766fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
786fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
839ae82921SPaul Mullowney 
847f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
872c4ab24aSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat);
887f756511SDominic Meiser 
8957181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9157181aedSStefano Zampini 
92c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
93e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
94219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
95c215019aSStefano Zampini 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* For a sequential matrix there is a single storage format, so MAT_CUSPARSE_MULT
     and MAT_CUSPARSE_ALL both set the same field; anything else is unsupported. */
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1139ae82921SPaul Mullowney 
114e057df02SPaul Mullowney /*@
11511a5261eSBarry Smith   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
11611a5261eSBarry Smith   operation. Only the `MatMult()` operation can use different GPU storage formats
11711a5261eSBarry Smith 
118e057df02SPaul Mullowney   Not Collective
119e057df02SPaul Mullowney 
120e057df02SPaul Mullowney   Input Parameters:
12111a5261eSBarry Smith + A      - Matrix of type `MATSEQAIJCUSPARSE`
1222ef1f0ffSBarry Smith . op     - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
1232ef1f0ffSBarry Smith         `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
12411a5261eSBarry Smith - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
125e057df02SPaul Mullowney 
126e057df02SPaul Mullowney   Level: intermediate
127e057df02SPaul Mullowney 
128fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
129e057df02SPaul Mullowney @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Forward to the implementation attached to this matrix type, if any;
     PetscTryMethod() is a no-op for types that do not provide it. */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
137e057df02SPaul Mullowney 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  /* Record the preference; the solve routines consult this flag at solve time */
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(PETSC_SUCCESS);
}
146365b711fSMark Adams 
147365b711fSMark Adams /*@
14811a5261eSBarry Smith   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
149365b711fSMark Adams 
150365b711fSMark Adams   Input Parameters:
15111a5261eSBarry Smith + A       - Matrix of type `MATSEQAIJCUSPARSE`
15211a5261eSBarry Smith - use_cpu - set flag for using the built-in CPU `MatSolve()`
153365b711fSMark Adams 
1542ef1f0ffSBarry Smith   Level: intermediate
155365b711fSMark Adams 
15611a5261eSBarry Smith   Note:
157365b711fSMark Adams   The cuSparse LU solver currently computes the factors with the built-in CPU method
158365b711fSMark Adams   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
159365b711fSMark Adams   This method is used to specify whether the solve is done on the CPU or GPU (GPU is the default).
160365b711fSMark Adams 
1611cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
162365b711fSMark Adams @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Forward to the type-specific implementation when the matrix provides one */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
170365b711fSMark Adams 
static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* When the option is being turned off, destroy any cached transpose first so a
       stale copy cannot be picked up if the option is re-enabled later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    /* Every other option is handled by the host SeqAIJ implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
186e6e9a74fSStefano Zampini 
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                set;
  Mat_SeqAIJCUSPARSE      *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* Storage format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&format, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* Storage format for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&format, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusp->use_cpu_solve, &cusp->use_cpu_solve, &set));
    if (set) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusp->spmvAlg, (PetscEnum *)&cusp->spmvAlg, &set));
    /* PetscOptionsEnum() assigns values by position in MatCUSPARSESpMVAlgorithms[]; if the user
       set the option, verify those positions still agree with cuSPARSE's own enum values */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCheck(!set || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!set || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusp->spmmAlg, (PetscEnum *)&cusp->spmmAlg, &set));
    PetscCheck(!set || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusp->csr2cscAlg, (PetscEnum *)&cusp->csr2cscAlg, &set));
    PetscCheck(!set || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
2229ae82921SPaul Mullowney 
223b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or refresh) the combined LU factor matrix M on the GPU from the host SeqAIJ factors,
// and run the numeric cusparseSpSV_analysis() needed before triangular solves.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *rowptr, *colidx, nz;
  PetscScalar                  *val;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // only rebuild when the newest factors live on the CPU
    if (!fs->csrRowPtr) {                    // first-time setup? csrRowPtr is non-null afterwards even when m=0
      // Re-arrange the (skewed) factored matrix into a regular host CSR matrix M: row i holds the
      // strictly lower part (L, unit diagonal dropped), the diagonal entry, then the upper part (U).
      nz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // nnz(L) without the unit diagonal + nnz(U) with the non-unit diagonal
      PetscCall(PetscMalloc1(m + 1, &rowptr));
      PetscCall(PetscMalloc1(nz, &colidx)); // column indices are only needed during this setup
      PetscCall(PetscMalloc1(nz, &val));
      rowptr[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Ai[i + 1] - Ai[i];       // strictly-lower entries in row i
        PetscInt ulen = Adiag[i] - Adiag[i + 1]; // upper entries in row i, diagonal included
        PetscCall(PetscArraycpy(colidx + rowptr[i], Aj + Ai[i], llen));                           // entries of L
        colidx[rowptr[i] + llen] = i;                                                             // diagonal entry
        PetscCall(PetscArraycpy(colidx + rowptr[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U to the right of the diagonal
        rowptr[i + 1] = rowptr[i] + llen + ulen;
      }
      // Upload M's structure; values are filled in below on every (re)factorization
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * nz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * nz));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, rowptr, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, colidx, sizeof(*fs->csrColIdx) * nz, cudaMemcpyHostToDevice));

      // Create descriptors for L and U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // With CUSPARSE_DIAG_TYPE_UNIT cuSPARSE assumes the diagonal entries are one and never reads
      // them, regardless of what is stored — which is why L and U can share the same CSR arrays.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_LOWER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT;
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      fillMode = CUSPARSE_FILL_MODE_UPPER;
      diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate the dense work vectors used by SpSV
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query SpSV buffer sizes and allocate the buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));

      // Keep the host row pointers and value array for the numeric updates below; the column indices are done
      fs->csrRowPtr_h = rowptr;
      fs->csrVal_h    = val;
      PetscCall(PetscFree(colidx));
    }
    // Refresh M's numerical values from the host factors
    rowptr = fs->csrRowPtr_h;
    val    = fs->csrVal_h;
    nz     = rowptr[m];
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Ai[i + 1] - Ai[i];
      PetscInt ulen = Adiag[i] - Adiag[i + 1];
      PetscCall(PetscArraycpy(val + rowptr[i], Aa + Ai[i], llen));                           // entries of L
      val[rowptr[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]];                                 // recover the diagonal entry (stored inverted in the SeqAIJ factor)
      PetscCall(PetscArraycpy(val + rowptr[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U to the right of the diagonal
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, val, sizeof(*val) * nz, cudaMemcpyHostToDevice));

    // cusparseSpSV_analysis() is numeric and therefore needs valid, up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    // The values of L and U changed, so any previous transpose-solve analysis is stale
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
320d460d7bfSJunchao Zhang #else
321d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
322d71ae5a4SJacob Faibussowitsch {
3239ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3249ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3259ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
326aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
3279ae82921SPaul Mullowney   const PetscInt                    *ai = a->i, *aj = a->j, *vi;
3289ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3299ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3309ae82921SPaul Mullowney   PetscInt                           i, nz, nzLower, offset, rowOffset;
3319ae82921SPaul Mullowney 
3329ae82921SPaul Mullowney   PetscFunctionBegin;
3333ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
334c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3359ae82921SPaul Mullowney     try {
3369ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3379ae82921SPaul Mullowney       nzLower = n + ai[n] - ai[1];
338da79fbbcSStefano Zampini       if (!loTriFactor) {
3392cbc15d9SMark         PetscScalar *AALo;
3402cbc15d9SMark 
3419566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
3429ae82921SPaul Mullowney 
3439ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3449566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
3459566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));
3469ae82921SPaul Mullowney 
3479ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3489ae82921SPaul Mullowney         AiLo[0]   = (PetscInt)0;
3499ae82921SPaul Mullowney         AiLo[n]   = nzLower;
3509ae82921SPaul Mullowney         AjLo[0]   = (PetscInt)0;
3519ae82921SPaul Mullowney         AALo[0]   = (MatScalar)1.0;
3529ae82921SPaul Mullowney         v         = aa;
3539ae82921SPaul Mullowney         vi        = aj;
3549ae82921SPaul Mullowney         offset    = 1;
3559ae82921SPaul Mullowney         rowOffset = 1;
3569ae82921SPaul Mullowney         for (i = 1; i < n; i++) {
3579ae82921SPaul Mullowney           nz = ai[i + 1] - ai[i];
358e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3599ae82921SPaul Mullowney           AiLo[i] = rowOffset;
3609ae82921SPaul Mullowney           rowOffset += nz + 1;
3619ae82921SPaul Mullowney 
362f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AjLo[offset], vi, nz));
363f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&AALo[offset], v, nz));
3649ae82921SPaul Mullowney 
3659ae82921SPaul Mullowney           offset += nz;
3669ae82921SPaul Mullowney           AjLo[offset] = (PetscInt)i;
3679ae82921SPaul Mullowney           AALo[offset] = (MatScalar)1.0;
3689ae82921SPaul Mullowney           offset += 1;
3699ae82921SPaul Mullowney 
3709ae82921SPaul Mullowney           v += nz;
3719ae82921SPaul Mullowney           vi += nz;
3729ae82921SPaul Mullowney         }
3732205254eSKarl Rupp 
374aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
3759566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
376da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
377aa372e3fSPaul Mullowney         /* Create the matrix description */
3789566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
3799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
3801b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
3819566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
382afb2bd1cSJunchao Zhang   #else
3839566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
384afb2bd1cSJunchao Zhang   #endif
3859566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
3869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
387aa372e3fSPaul Mullowney 
388aa372e3fSPaul Mullowney         /* set the operation */
389aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
390aa372e3fSPaul Mullowney 
391aa372e3fSPaul Mullowney         /* set the matrix */
392aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
393aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = n;
394aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = n;
395aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
396aa372e3fSPaul Mullowney 
397aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
398aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);
399aa372e3fSPaul Mullowney 
400aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
401aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);
402aa372e3fSPaul Mullowney 
403aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
404aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);
405aa372e3fSPaul Mullowney 
406afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4079566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
408261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
4091b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4109371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4119371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
4129566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
413afb2bd1cSJunchao Zhang   #endif
414afb2bd1cSJunchao Zhang 
415aa372e3fSPaul Mullowney         /* perform the solve analysis */
4169371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
4179f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
4189566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4199566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
420aa372e3fSPaul Mullowney 
421da79fbbcSStefano Zampini         /* assign the pointer */
422aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
4232cbc15d9SMark         loTriFactor->AA_h                                          = AALo;
4249566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4269566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
427da79fbbcSStefano Zampini       } else { /* update values only */
42848a46eb9SPierre Jolivet         if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
429da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4302cbc15d9SMark         loTriFactor->AA_h[0] = 1.0;
431da79fbbcSStefano Zampini         v                    = aa;
432da79fbbcSStefano Zampini         vi                   = aj;
433da79fbbcSStefano Zampini         offset               = 1;
434da79fbbcSStefano Zampini         for (i = 1; i < n; i++) {
435da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i];
436f4f49eeaSPierre Jolivet           PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz));
437da79fbbcSStefano Zampini           offset += nz;
4382cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
439da79fbbcSStefano Zampini           offset += 1;
440da79fbbcSStefano Zampini           v += nz;
441da79fbbcSStefano Zampini         }
4422cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
4439566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
444da79fbbcSStefano Zampini       }
445d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
446d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
447d71ae5a4SJacob Faibussowitsch     }
4489ae82921SPaul Mullowney   }
4493ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4509ae82921SPaul Mullowney }
4519ae82921SPaul Mullowney 
/*
  Build (or numerically refresh) the upper-triangular ILU factor U of A on the GPU,
  for the legacy (pre CUDA 11.4) cusparse csrsv triangular-solve path.

  Host layout (derived from the index arithmetic below): for row i the U entries of
  the factored Mat_SeqAIJ occupy aa[adiag[i+1]+1 .. adiag[i]], off-diagonal entries
  first and the diagonal entry last (read as v[nz]); adiag[] decreases with i, so
  rows are stored back-to-front and are unpacked here from i = n-1 down to 0.

  First call (upTriFactor == NULL): assemble a regular diagonal-first CSR copy in
  pinned host memory, create the cusparse matrix descriptor and csrsv analysis
  info, upload everything, and keep the pinned value buffer in upTriFactor->AA_h
  so later calls can refresh values cheaply. Subsequent calls: re-pack and
  re-upload the values only, reusing the existing structure and analysis.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS); /* empty matrix: nothing to build or upload */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { /* only act when the CPU copy is newer than (or absent from) the GPU */
    try {
      /* number of nonzeros in the upper triangular matrix (adiag[] decreases, hence adiag[0] - adiag[n]) */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first time: build structure, descriptor, and analysis info */
        PetscScalar *AAUp;

        /* pinned host staging buffer for the values; retained in AA_h for later refreshes */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, assembling rows from last to first
           while offset counts down from nzUpper */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements; the CSR row is diagonal-first.
             NOTE(review): taking 1./v[nz] presumably re-inverts a diagonal the host
             factorization stores inverted, yielding the true diagonal required by
             CUSPARSE_DIAG_TYPE_NON_UNIT below — confirm against the host ILU numeric
             factorization before changing */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          /* then the nz off-diagonal entries of row i */
          PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz));
          PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an explicit work buffer; query its size, then allocate it */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; keep the pinned value buffer (AA_h) for cheap
           numeric refreshes, but free the structural arrays which are now on the GPU */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* structure already on the GPU: refresh numerical values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix (same packing as above, values only) */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) { /* thrust/CsrMatrix operations may throw */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
582d460d7bfSJunchao Zhang #endif
5839ae82921SPaul Mullowney 
/* Bring the ILU factors of A up to date on the GPU: build (or refresh) the device
   triangular factors, and upload the row/column permutation indices — but only when
   the corresponding ordering is not the identity, and only once (the device arrays
   are cached in the tri-factors structure). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis = aij->row, colis = aij->icol;
  PetscBool                     identity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A));
#else
  /* legacy path: separate lower/upper csrsv factors plus a scratch vector for the two-stage solve */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
#endif

  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU

  /* row permutation indices (skipped for the identity ordering; uploaded once) */
  PetscCall(ISIdentity(rowis, &identity));
  if (!identity && !fs->rpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(rowis, &indices));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(indices, indices + m);
    PetscCall(ISRestoreIndices(rowis, &indices));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* column permutation indices (same caching scheme as the rows) */
  PetscCall(ISIdentity(colis, &identity));
  if (!identity && !fs->cpermIndices) {
    const PetscInt *indices;

    PetscCall(ISGetIndices(colis, &indices));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(indices, indices + m);
    PetscCall(ISRestoreIndices(colis, &indices));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
6309ae82921SPaul Mullowney 
631b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
// Build (or numerically refresh) the GPU data for MatSolve_SeqAIJCUSPARSE_Cholesky()
// from the host ICC/Cholesky factor stored in A (Ut D U, with unit-diagonal U):
// upload U as a regular CSR matrix and D as a separate vector, then run
// cusparseSpSV_analysis() for both the U and Ut solves.
// NOTE(review): "Cheolesky" in the name is a typo for "Cholesky"; it is a static
// function, but renaming it here would break call sites elsewhere in this file.
static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A)
{
  Mat_SeqAIJ                   *a  = static_cast<Mat_SeqAIJ *>(A->data);
  PetscInt                      m  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const PetscInt               *Ai = a->i, *Aj = a->j, *Adiag = a->diag;
  const MatScalar              *Aa = a->a;
  PetscInt                     *Mj, Mnz;
  PetscScalar                  *Ma, *D;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU
    if (!fs->csrRowPtr) {                    // first-time setup? (csrRowPtr is the flag since it is non-null even when m=0)
      // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host.
      // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host.
      Mnz = Ai[m]; // Unz (with the unit diagonal)
      PetscCall(PetscMalloc1(Mnz, &Ma));
      PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp
      PetscCall(PetscMalloc1(m, &D));    // the diagonal
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Ai[i + 1] - Ai[i];
        Mj[Ai[i]]     = i;                                              // diagonal entry
        PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal
      }
      // Copy M (U) from host to device; the row pointers of M coincide with Ai[]
      PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1)));
      PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz));
      PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m));
      PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice));

      // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
      // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
      // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
      // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
      // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
      cusparseFillMode_t        fillMode  = CUSPARSE_FILL_MODE_UPPER;
      cusparseDiagType_t        diagType  = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal
      const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I;

      PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
      PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

      // Allocate work vectors in SpSv
      PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m));
      PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m));

      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
      PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

      // Query buffer sizes for SpSV and then allocate buffers
      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));

      PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer
      PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
      PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));

      // Record for reuse: keep the host value buffer and diagonal for later numeric refreshes; Mj is no longer needed
      fs->csrVal_h = Ma;
      fs->diag_h   = D;
      PetscCall(PetscFree(Mj));
    }
    // Copy the value (runs on first call and on every numeric refresh)
    Ma  = fs->csrVal_h;
    D   = fs->diag_h;
    Mnz = Ai[m];
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Aa[Adiag[i]];   // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT
      // NOTE(review): off-diagonals are copied NEGATED — presumably matching the sign
      // convention of the host ICC factor layout (see MatICCFactorSymbolic_SeqAIJ());
      // confirm before changing
      for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k];
    }
    PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice));

    // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
716d460d7bfSJunchao Zhang 
// Solve Ut D U x = b entirely on the GPU, as a five-stage pipeline:
//   1) X = Pr b         row permutation (only if rpermIndices is set)
//   2) Ut Y = X         transpose SpSV using spsvDescr_Ut
//   3) Y = Y .* D       element-wise scale; D holds the already-inverted diagonal,
//                       so this multiplication is the "divide by D" step
//   4) U Z = Y          SpSV using spsvDescr_U; Z lands in fs->X (if a column
//                       permutation follows) or directly in x
//   5) x = Pc Z         column permutation (only if cpermIndices is set)
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    // no permutation: point dnVecDescr_X straight at b (const cast is safe; X is only read by the Ut solve)
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ().
  // It is basically a vector element-wise multiplication, but cublas does not have it!
  PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>()));

  // Solve U X = Y; note dnVecDescr_X is repointed here and now serves as the OUTPUT of the solve
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  // flop count for the two triangular solves plus the diagonal scale
  PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
772d460d7bfSJunchao Zhang #else
/* Build the GPU triangular-solve structures for an ICC(0) factor on the legacy
   (pre-CUDA-11.4) cuSPARSE path (this function sits in the #else branch of the
   version guard that precedes it).

   The numeric factor lives on the host in A->data; note that A->data is cast
   BOTH to Mat_SeqAIJ (for a->nz) and to Mat_SeqSBAIJ (for the i/j/a arrays), so
   the factor is assumed to be stored in SBAIJ upper-triangular layout -- with
   the diagonal entry stored LAST within each row (hence v[nz] below).

   Two CSR factors sharing the same (upper-triangular) sparsity pattern are
   uploaded:
     - upTriFactor: unit-diagonal upper factor, solved NON_TRANSPOSE;
     - loTriFactor: non-unit-diagonal factor over the same pattern, solved
       with CUSPARSE_OPERATION_TRANSPOSE, i.e. it acts as the lower factor.
   On a repeated numeric factorization (same pattern) only the value arrays are
   refreshed. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data; /* same storage viewed as SBAIJ; aliases 'a' */
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* Only (re)build when the up-to-date values are on the CPU side */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays (enables fast H2D copies) */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* First factorization: build row offsets, column indices AND values. */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: the diagonal of row i is stored
             at the END of the SBAIJ row, i.e. v[nz]; we store its reciprocal
             (the descriptor below declares the upper factor DIAG_TYPE_UNIT,
             so presumably the solve kernels apply this inverse explicitly --
             see the MatSolve path to confirm) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset; /* row i starts at its own diagonal entry */
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            /* copy the strictly-upper part of row i, then rescale:
               upper factor gets the negated values, lower factor gets them
               additionally divided by the diagonal (sign/scaling convention
               matching the companion solve kernels) */
            PetscCall(PetscArraycpy(&AjUp[offset], vj, nz));
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix (upload the CSR arrays to device via thrust assign) */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2-style API requires an explicit work buffer sized by a query call */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information (lower factor:
           same CSR pattern as the upper one, but solved TRANSPOSED and with a
           non-unit diagonal) */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
  #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
  #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation: transpose of the stored upper pattern == lower solve */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix (row offsets/column indices are identical to the upper factor) */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
  #endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        /* pattern + two value arrays were transferred; AiUp/AjUp were only
           needed for this first build */
        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* Re-factorization with unchanged sparsity: recompute values only and
           refresh the existing device arrays in place. */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (reciprocal of the trailing diagonal entry) */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&AAUp[offset], v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
969d460d7bfSJunchao Zhang #endif
970087f3262SPaul Mullowney 
/* Push the ICC factor of A to the GPU and set up everything the device-side
   triangular solves need: the factored matrix itself (path depends on the
   cuSPARSE generation), the cached nonzero count, and -- when the factorization
   used a non-trivial row permutation -- device copies of the permutation and
   its inverse for shuffling the right-hand side/solution vectors. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis;
  PetscBool                     identity;
  PetscInt                      nrows;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  rowis = aij->row;
  nrows = A->rmap->n;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A));
#else
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  /* legacy path needs a scratch vector for the two-stage triangular solve */
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(nrows);
#endif
  /* nnz of the full (symmetric) factor: off-diagonals counted twice plus the diagonal */
  fs->nnz = (aij->nz - nrows) * 2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* With a natural ordering no permutation vectors are required */
  PetscCall(ISIdentity(rowis, &identity));
  if (identity) PetscFunctionReturn(PETSC_SUCCESS);

  {
    IS              inv_is;
    const PetscInt *inv_idx, *row_idx;

    /* upload the row permutation and its inverse for use during the solves */
    PetscCall(ISInvertPermutation(rowis, PETSC_DECIDE, &inv_is));
    PetscCall(ISGetIndices(inv_is, &inv_idx));
    PetscCall(ISGetIndices(rowis, &row_idx));
    fs->rpermIndices = new THRUSTINTARRAY(nrows);
    fs->rpermIndices->assign(row_idx, row_idx + nrows);
    fs->cpermIndices = new THRUSTINTARRAY(nrows);
    fs->cpermIndices->assign(inv_idx, inv_idx + nrows);
    PetscCall(ISRestoreIndices(inv_is, &inv_idx));
    PetscCall(ISDestroy(&inv_is));
    PetscCall(ISRestoreIndices(rowis, &row_idx));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1012087f3262SPaul Mullowney 
/* Numeric Cholesky factorization for SEQAIJCUSPARSE: the factorization itself
   runs on the CPU (via the SeqAIJ kernel), after which the solve function
   pointers are wired to the appropriate GPU implementations and the factor is
   uploaded to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  /* make sure the freshest values of A are on the host, then factor there */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* modern cuSPARSE (SpSV) path: one routine handles both solve directions */
  B->ops->solve          = MatSolve_SeqAIJCUSPARSE_Cholesky;
  B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky;
#else
  /* legacy path: pick the solver variant based on whether the row ordering is natural */
  Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
  PetscBool   is_natural;

  PetscCall(ISIdentity(bseq->row, &is_natural));
  B->ops->solve          = is_natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = is_natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
#endif
  /* no multi-RHS solve support on this path */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build/upload the device-side triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
10459ae82921SPaul Mullowney 
1046b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
1047d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
1048d71ae5a4SJacob Faibussowitsch {
1049bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1050aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1051aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1052da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1053da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1054aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
1055aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
1056aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
1057aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
1058b175d8bbSPaul Mullowney 
1059bda325fcSPaul Mullowney   PetscFunctionBegin;
1060aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10619566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1062da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1063aa372e3fSPaul Mullowney 
1064aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1065aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1066aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
10679371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1068aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
1069aa372e3fSPaul Mullowney 
1070aa372e3fSPaul Mullowney   /* Create the matrix description */
10719566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10729566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10739566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10749566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10759566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1076aa372e3fSPaul Mullowney 
1077aa372e3fSPaul Mullowney   /* set the operation */
1078aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1079aa372e3fSPaul Mullowney 
1080aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1081aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
1082afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1083afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1084aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1085afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
1086afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1087afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1088aa372e3fSPaul Mullowney 
1089aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1090afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10919371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
10929371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10939371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10949566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
1095afb2bd1cSJunchao Zhang   #endif
1096afb2bd1cSJunchao Zhang 
10979566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10989f7ba44dSJacob Faibussowitsch   {
10999f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11009f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
11019371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
1102afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11039f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
1104afb2bd1cSJunchao Zhang   #else
11059f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1106afb2bd1cSJunchao Zhang   #endif
11079f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11089f7ba44dSJacob Faibussowitsch   }
11099f7ba44dSJacob Faibussowitsch 
11109566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11119566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1112aa372e3fSPaul Mullowney 
1113afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11149566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1115261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
11161b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11179371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11189371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
11199566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
1120afb2bd1cSJunchao Zhang   #endif
1121afb2bd1cSJunchao Zhang 
1122afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11239371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
11249f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
11259f7ba44dSJacob Faibussowitsch 
11269566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11279566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1128aa372e3fSPaul Mullowney 
1129da79fbbcSStefano Zampini   /* assign the pointer */
1130aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1131aa372e3fSPaul Mullowney 
1132aa372e3fSPaul Mullowney   /*********************************************/
1133aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1134aa372e3fSPaul Mullowney   /*********************************************/
1135aa372e3fSPaul Mullowney 
1136aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
11379566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1138da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1139aa372e3fSPaul Mullowney 
1140aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1141aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1142aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
11439371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1144aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
1145aa372e3fSPaul Mullowney 
1146aa372e3fSPaul Mullowney   /* Create the matrix description */
11479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11489566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11499566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11509566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11519566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1152aa372e3fSPaul Mullowney 
1153aa372e3fSPaul Mullowney   /* set the operation */
1154aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1155aa372e3fSPaul Mullowney 
1156aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1157aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
1158afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1159afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1160aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1161afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
1162afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1163afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1164aa372e3fSPaul Mullowney 
1165aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1166afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11679371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
11689371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11699371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11709566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
1171afb2bd1cSJunchao Zhang   #endif
1172afb2bd1cSJunchao Zhang 
11739566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
11749f7ba44dSJacob Faibussowitsch   {
11759f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
11769f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
11779371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
1178afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11799f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
1180afb2bd1cSJunchao Zhang   #else
11819f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
1182afb2bd1cSJunchao Zhang   #endif
11839f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
11849f7ba44dSJacob Faibussowitsch   }
1185d49cd2b7SBarry Smith 
11869566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11879566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1188aa372e3fSPaul Mullowney 
1189afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11909566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1191261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
11921b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
11939371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
11949371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
11959566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
1196afb2bd1cSJunchao Zhang   #endif
1197afb2bd1cSJunchao Zhang 
1198afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11995f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
12009371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12019f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1202d49cd2b7SBarry Smith 
12039566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
12049566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
1205aa372e3fSPaul Mullowney 
1206da79fbbcSStefano Zampini   /* assign the pointer */
1207aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
12083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1209bda325fcSPaul Mullowney }
1210d460d7bfSJunchao Zhang #endif
1211bda325fcSPaul Mullowney 
12129371c9d4SSatish Balay struct PetscScalarToPetscInt {
12139371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
1214a49f1ed0SStefano Zampini };
1215a49f1ed0SStefano Zampini 
1216d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1217d71ae5a4SJacob Faibussowitsch {
1218aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1219a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1220bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1221bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1222aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1223b175d8bbSPaul Mullowney 
1224bda325fcSPaul Mullowney   PetscFunctionBegin;
12259566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1226a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
122728b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1228a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
122908401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
12303ba16761SJacob Faibussowitsch   if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
12319566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
12329566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
123348a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1234a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1235aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12369566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1237aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12389566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12399566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1240aa372e3fSPaul Mullowney 
1241b06137fdSPaul Mullowney     /* set alpha and beta */
1242f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
1243f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
1244f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
12459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
12479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1248b06137fdSPaul Mullowney 
1249aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1250aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1251a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1252554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1253554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1254aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1255a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1256aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1257aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1258a3fdcf43SKarl Rupp 
1259ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
126081902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1261afb2bd1cSJunchao Zhang 
1262afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
12633606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
12649371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12659371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
12669371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
12673606e59fSJunchao Zhang   #else
12683606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12693606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12703606e59fSJunchao Zhang 
12713606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12723606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12733606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12743606e59fSJunchao Zhang         */
12753606e59fSJunchao Zhang       if (matrixT->num_entries) {
12769371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
12779371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
12783606e59fSJunchao Zhang 
12793606e59fSJunchao Zhang       } else {
12803606e59fSJunchao Zhang         matstructT->matDescr = NULL;
12813606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
12823606e59fSJunchao Zhang       }
12833606e59fSJunchao Zhang   #endif
1284afb2bd1cSJunchao Zhang #endif
1285aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1286afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1287afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1288afb2bd1cSJunchao Zhang #else
1289aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
129051c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
129151c6d536SStefano Zampini       /* First convert HYB to CSR */
1292aa372e3fSPaul Mullowney       temp->num_rows       = A->rmap->n;
1293aa372e3fSPaul Mullowney       temp->num_cols       = A->cmap->n;
1294aa372e3fSPaul Mullowney       temp->num_entries    = a->nz;
1295aa372e3fSPaul Mullowney       temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1296aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1297aa372e3fSPaul Mullowney       temp->values         = new THRUSTARRAY(a->nz);
1298aa372e3fSPaul Mullowney 
12999371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
13009371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1301aa372e3fSPaul Mullowney 
1302aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1303aa372e3fSPaul Mullowney       tempT->num_rows       = A->rmap->n;
1304aa372e3fSPaul Mullowney       tempT->num_cols       = A->cmap->n;
1305aa372e3fSPaul Mullowney       tempT->num_entries    = a->nz;
1306aa372e3fSPaul Mullowney       tempT->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
1307aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1308aa372e3fSPaul Mullowney       tempT->values         = new THRUSTARRAY(a->nz);
1309aa372e3fSPaul Mullowney 
13109371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
13119371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13129371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1313aa372e3fSPaul Mullowney 
1314aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1315aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13169566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
13179371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
13189371c9d4SSatish Balay       stat                             = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
13199371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1320aa372e3fSPaul Mullowney 
1321aa372e3fSPaul Mullowney       /* assign the pointer */
1322aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13231a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1324aa372e3fSPaul Mullowney       /* delete temporaries */
1325aa372e3fSPaul Mullowney       if (tempT) {
1326aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1327aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1328aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1329aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1330087f3262SPaul Mullowney       }
1331aa372e3fSPaul Mullowney       if (temp) {
1332aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1333aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1334aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1335aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1336aa372e3fSPaul Mullowney       }
1337afb2bd1cSJunchao Zhang #endif
1338aa372e3fSPaul Mullowney     }
1339a49f1ed0SStefano Zampini   }
1340a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1341a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1342a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
134328b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
134428b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
134528b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
134628b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
134728b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
134828b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
134928b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
135028b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1351a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1352a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1353a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
13549566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1355a49f1ed0SStefano Zampini     }
1356a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1357a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1358792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1359a49f1ed0SStefano Zampini 
1360a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1361a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1362a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1363a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
13649371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
13659371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
13669371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
13679566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1368a49f1ed0SStefano Zampini #endif
1369a49f1ed0SStefano Zampini 
13701a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13711a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13721a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13731a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13741a2c6b5cSJunchao Zhang 
13751a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13761a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13771a2c6b5cSJunchao Zhang         */
13789371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1379a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13809371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
13819371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1382a49f1ed0SStefano Zampini #else
13839371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
13849371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1385a49f1ed0SStefano Zampini #endif
13861a2c6b5cSJunchao Zhang       } else {
13871a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
13881a2c6b5cSJunchao Zhang       }
13891a2c6b5cSJunchao Zhang 
1390a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1391792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1392a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
13939566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1394a49f1ed0SStefano Zampini #endif
1395a49f1ed0SStefano Zampini     }
13969371c9d4SSatish Balay     PetscCallThrust(
13979371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1398a49f1ed0SStefano Zampini   }
13999566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14009566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1401213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1402213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1403aa372e3fSPaul Mullowney   /* assign the pointer */
1404aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
14051a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
14063ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1407bda325fcSPaul Mullowney }
1408bda325fcSPaul Mullowney 
1409b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
1410d460d7bfSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1411d460d7bfSJunchao Zhang {
1412d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1413d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1414d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1415d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1416d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1417d460d7bfSJunchao Zhang   const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
1418d460d7bfSJunchao Zhang   const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1419d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1420d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1421d460d7bfSJunchao Zhang 
1422d460d7bfSJunchao Zhang   PetscFunctionBegin;
1423d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1424d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1425d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1426d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1427d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1428d460d7bfSJunchao Zhang 
1429d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1430d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1431d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1432d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1433d460d7bfSJunchao Zhang   } else {
1434d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1435d460d7bfSJunchao Zhang   }
1436d460d7bfSJunchao Zhang 
1437d460d7bfSJunchao Zhang   // Solve L Y = X
1438d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1439d460d7bfSJunchao Zhang   // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
1440d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));
1441d460d7bfSJunchao Zhang 
1442d460d7bfSJunchao Zhang   // Solve U X = Y
1443d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1444d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1445d460d7bfSJunchao Zhang   } else {
1446d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1447d460d7bfSJunchao Zhang   }
1448d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));
1449d460d7bfSJunchao Zhang 
1450d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1451d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1452d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1453d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1454d460d7bfSJunchao Zhang   }
1455d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1456d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1457d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1458d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
1459d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1460d460d7bfSJunchao Zhang }
1461d460d7bfSJunchao Zhang 
1462d460d7bfSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
1463d460d7bfSJunchao Zhang {
1464d460d7bfSJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
1465d460d7bfSJunchao Zhang   Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
1466d460d7bfSJunchao Zhang   const PetscScalar                    *barray;
1467d460d7bfSJunchao Zhang   PetscScalar                          *xarray;
1468d460d7bfSJunchao Zhang   thrust::device_ptr<const PetscScalar> bGPU;
1469d460d7bfSJunchao Zhang   thrust::device_ptr<PetscScalar>       xGPU;
1470d460d7bfSJunchao Zhang   const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
1471d460d7bfSJunchao Zhang   const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
1472d460d7bfSJunchao Zhang   PetscInt                              m   = A->rmap->n;
1473d460d7bfSJunchao Zhang 
1474d460d7bfSJunchao Zhang   PetscFunctionBegin;
1475d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1476d460d7bfSJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
1477d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
1478d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
1479d460d7bfSJunchao Zhang                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1480d460d7bfSJunchao Zhang 
1481d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
1482d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1483d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
1484d460d7bfSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
1485d460d7bfSJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
1486d460d7bfSJunchao Zhang   }
1487d460d7bfSJunchao Zhang 
1488d460d7bfSJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
1489d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1490d460d7bfSJunchao Zhang 
1491d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
1492d460d7bfSJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1493d460d7bfSJunchao Zhang   }
1494d460d7bfSJunchao Zhang 
1495d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1496d460d7bfSJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1497d460d7bfSJunchao Zhang   xGPU = thrust::device_pointer_cast(xarray);
1498d460d7bfSJunchao Zhang   bGPU = thrust::device_pointer_cast(barray);
1499d460d7bfSJunchao Zhang 
1500d460d7bfSJunchao Zhang   // Reorder b with the row permutation if needed, and wrap the result in fs->X
1501d460d7bfSJunchao Zhang   if (fs->rpermIndices) {
1502d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
1503d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1504d460d7bfSJunchao Zhang   } else {
1505d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1506d460d7bfSJunchao Zhang   }
1507d460d7bfSJunchao Zhang 
1508d460d7bfSJunchao Zhang   // Solve Ut Y = X
1509d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
1510d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));
1511d460d7bfSJunchao Zhang 
1512d460d7bfSJunchao Zhang   // Solve Lt X = Y
1513d460d7bfSJunchao Zhang   if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
1514d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
1515d460d7bfSJunchao Zhang   } else {
1516d460d7bfSJunchao Zhang     PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
1517d460d7bfSJunchao Zhang   }
1518d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));
1519d460d7bfSJunchao Zhang 
1520d460d7bfSJunchao Zhang   // Reorder X with the column permutation if needed, and put the result back to x
1521d460d7bfSJunchao Zhang   if (fs->cpermIndices) {
1522d460d7bfSJunchao Zhang     PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
1523d460d7bfSJunchao Zhang                                  thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
1524d460d7bfSJunchao Zhang   }
1525d460d7bfSJunchao Zhang 
1526d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1527d460d7bfSJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1528d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1529d460d7bfSJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
1530d460d7bfSJunchao Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1531d460d7bfSJunchao Zhang }
1532d460d7bfSJunchao Zhang #else
1533a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1534d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1535d71ae5a4SJacob Faibussowitsch {
1536c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1537465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1538465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1539465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1540465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1541bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1542aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1543aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1544aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1545bda325fcSPaul Mullowney 
1546bda325fcSPaul Mullowney   PetscFunctionBegin;
1547aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1548aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15499566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1550aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1551aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1552bda325fcSPaul Mullowney   }
1553bda325fcSPaul Mullowney 
1554bda325fcSPaul Mullowney   /* Get the GPU pointers */
15559566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
15569566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1557c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1558c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1559bda325fcSPaul Mullowney 
15609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1561aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
15629371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1563aa372e3fSPaul Mullowney 
1564aa372e3fSPaul Mullowney   /* First, solve U */
15659f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
15669f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1567aa372e3fSPaul Mullowney 
1568aa372e3fSPaul Mullowney   /* Then, solve L */
15699f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
15709f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1571aa372e3fSPaul Mullowney 
1572aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
15739371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1574aa372e3fSPaul Mullowney 
1575aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1576a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1577bda325fcSPaul Mullowney 
1578bda325fcSPaul Mullowney   /* restore */
15799566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
15809566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
15819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
15833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1584bda325fcSPaul Mullowney }
1585bda325fcSPaul Mullowney 
1586d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1587d71ae5a4SJacob Faibussowitsch {
1588465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1589465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1590bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1591aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1592aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1593aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1594bda325fcSPaul Mullowney 
1595bda325fcSPaul Mullowney   PetscFunctionBegin;
1596aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1597aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1599aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1600aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1601bda325fcSPaul Mullowney   }
1602bda325fcSPaul Mullowney 
1603bda325fcSPaul Mullowney   /* Get the GPU pointers */
16049566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16059566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1606bda325fcSPaul Mullowney 
16079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1608aa372e3fSPaul Mullowney   /* First, solve U */
16099f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
16109f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1611aa372e3fSPaul Mullowney 
1612aa372e3fSPaul Mullowney   /* Then, solve L */
16139f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
16149f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1615bda325fcSPaul Mullowney 
1616bda325fcSPaul Mullowney   /* restore */
16179566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16189566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1622bda325fcSPaul Mullowney }
1623bda325fcSPaul Mullowney 
1624d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1625d71ae5a4SJacob Faibussowitsch {
1626465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1627465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1628465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1629465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16309ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1631aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1632aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1633aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16349ae82921SPaul Mullowney 
16359ae82921SPaul Mullowney   PetscFunctionBegin;
1636e057df02SPaul Mullowney   /* Get the GPU pointers */
16379566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16389566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1639c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1640c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16419ae82921SPaul Mullowney 
16429566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1643aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
16449371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1645aa372e3fSPaul Mullowney 
1646aa372e3fSPaul Mullowney   /* Next, solve L */
16479f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16489f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1649aa372e3fSPaul Mullowney 
1650aa372e3fSPaul Mullowney   /* Then, solve U */
16519f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16529f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1653d49cd2b7SBarry Smith 
16544e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
16559371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
16569ae82921SPaul Mullowney 
16579566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16589566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16609566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16613ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16629ae82921SPaul Mullowney }
16639ae82921SPaul Mullowney 
1664d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1665d71ae5a4SJacob Faibussowitsch {
1666465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1667465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16689ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1669aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1670aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1671aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
16729ae82921SPaul Mullowney 
16739ae82921SPaul Mullowney   PetscFunctionBegin;
1674e057df02SPaul Mullowney   /* Get the GPU pointers */
16759566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
16769566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
16779ae82921SPaul Mullowney 
16789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1679aa372e3fSPaul Mullowney   /* First, solve L */
16809f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
16819f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1682d49cd2b7SBarry Smith 
1683aa372e3fSPaul Mullowney   /* Next, solve U */
16849f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
16859f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
16869ae82921SPaul Mullowney 
16879566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
16889566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
16899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16909566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
16913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
16929ae82921SPaul Mullowney }
1693d460d7bfSJunchao Zhang #endif
16949ae82921SPaul Mullowney 
1695b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
16968eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1697d71ae5a4SJacob Faibussowitsch {
1698da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1699da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1700da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1701da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1702da112707SJunchao Zhang   PetscInt                      m, nz;
1703da112707SJunchao Zhang   PetscBool                     flg;
1704da112707SJunchao Zhang 
1705da112707SJunchao Zhang   PetscFunctionBegin;
1706da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1707da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1708da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1709da112707SJunchao Zhang   }
1710da112707SJunchao Zhang 
1711da112707SJunchao Zhang   /* Copy A's value to fact */
1712da112707SJunchao Zhang   m  = fact->rmap->n;
1713da112707SJunchao Zhang   nz = aij->nz;
1714da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1715da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1716da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1717da112707SJunchao Zhang 
1718*bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeBegin());
1719da112707SJunchao Zhang   /* Factorize fact inplace */
17209371c9d4SSatish Balay   if (m)
17219371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1722d460d7bfSJunchao Zhang                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1723da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1724da112707SJunchao Zhang     int              numerical_zero;
1725da112707SJunchao Zhang     cusparseStatus_t status;
1726da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1727da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1728da112707SJunchao Zhang   }
1729da112707SJunchao Zhang 
173012ba2bc6SJunchao Zhang   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
173112ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
173212ba2bc6SJunchao Zhang   */
17339371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1734da112707SJunchao Zhang 
17359371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1736da112707SJunchao Zhang 
173712ba2bc6SJunchao Zhang   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
173812ba2bc6SJunchao Zhang   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
173912ba2bc6SJunchao Zhang 
1740da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1741d460d7bfSJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
1742d460d7bfSJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
1743da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1744da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1745*bdb0d812SBarry Smith   PetscCall(PetscLogGpuTimeEnd());
1746da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
17473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1748da112707SJunchao Zhang }
1749da112707SJunchao Zhang 
17508eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1751d71ae5a4SJacob Faibussowitsch {
1752da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1753da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1754da112707SJunchao Zhang   PetscInt                      m, nz;
1755da112707SJunchao Zhang 
1756da112707SJunchao Zhang   PetscFunctionBegin;
1757da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1758da112707SJunchao Zhang     PetscInt  i;
1759da112707SJunchao Zhang     PetscBool flg, missing;
1760da112707SJunchao Zhang 
1761da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1762da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1763da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1764da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1765da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1766da112707SJunchao Zhang   }
1767da112707SJunchao Zhang 
1768da112707SJunchao Zhang   /* Free the old stale stuff */
1769da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1770da112707SJunchao Zhang 
1771da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1772da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1773da112707SJunchao Zhang    */
1774da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1775da112707SJunchao Zhang 
1776da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1777da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1778da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1779da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1780da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1781da112707SJunchao Zhang 
1782da112707SJunchao Zhang   aij->row = NULL;
1783da112707SJunchao Zhang   aij->col = NULL;
1784da112707SJunchao Zhang 
1785da112707SJunchao Zhang   /* ====================================================================== */
1786da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1787da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1788da112707SJunchao Zhang   /* ====================================================================== */
1789da112707SJunchao Zhang   const int *Ai, *Aj;
1790da112707SJunchao Zhang 
1791da112707SJunchao Zhang   m  = fact->rmap->n;
1792da112707SJunchao Zhang   nz = aij->nz;
1793da112707SJunchao Zhang 
1794f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
1795f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
1796f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz));
1797d460d7bfSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai.  The returned Ai, Aj are 32-bit */
1798d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1799d460d7bfSJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1800da112707SJunchao Zhang 
1801da112707SJunchao Zhang   /* ====================================================================== */
1802da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1803da112707SJunchao Zhang   /* ====================================================================== */
1804da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1805da112707SJunchao Zhang   cusparseDiagType_t diagType;
1806da112707SJunchao Zhang 
1807da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1808da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1809da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1810da112707SJunchao Zhang 
1811da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1812da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1813da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1814da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1815da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1816da112707SJunchao Zhang   */
1817da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1818da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
1819d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18209371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18219371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1822da112707SJunchao Zhang 
1823da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1824da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
1825d460d7bfSJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18269371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18279371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1828da112707SJunchao Zhang 
1829da112707SJunchao Zhang   /* ========================================================================= */
1830da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1831da112707SJunchao Zhang   /* ========================================================================= */
1832da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
18339371c9d4SSatish Balay   if (m)
18349371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1835d460d7bfSJunchao Zhang                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M));
1836da112707SJunchao Zhang 
1837da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1838da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1839da112707SJunchao Zhang 
1840da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1841da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1842da112707SJunchao Zhang 
1843da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18449371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1845da112707SJunchao Zhang 
1846da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
18479371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1848da112707SJunchao Zhang 
1849da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
185012ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
185112ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
185212ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1853da112707SJunchao Zhang    */
185412ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
185512ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
185612ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1857da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
185812ba2bc6SJunchao Zhang   } else {
185912ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
186012ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1861da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
186212ba2bc6SJunchao Zhang   }
1863da112707SJunchao Zhang 
1864da112707SJunchao Zhang   /* ========================================================================== */
1865da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1866da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1867da112707SJunchao Zhang   /* ========================================================================== */
1868da112707SJunchao Zhang   int              structural_zero;
1869da112707SJunchao Zhang   cusparseStatus_t status;
1870da112707SJunchao Zhang 
1871da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18729371c9d4SSatish Balay   if (m)
18739371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
1874d460d7bfSJunchao Zhang                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1875da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1876da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1877da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1878da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1879da112707SJunchao Zhang   }
1880da112707SJunchao Zhang 
1881da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18820dd8c0acSJunchao Zhang   {
1883da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
18840dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1885da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1886da112707SJunchao Zhang 
1887da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1888da112707SJunchao Zhang     Ai    = Aseq->i;
1889da112707SJunchao Zhang     Adiag = Aseq->diag;
1890da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1891da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1892da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1893da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1894da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1895da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1896da112707SJunchao Zhang         */
1897da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1898da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1899da112707SJunchao Zhang       }
1900da112707SJunchao Zhang     }
1901da112707SJunchao Zhang     fs->numericFactFlops = flops;
19020dd8c0acSJunchao Zhang   }
1903da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
19043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1905da112707SJunchao Zhang }
1906da112707SJunchao Zhang 
/* Solve x = (L L^T)^{-1} b using the device IC(0) factor held in fact->spptr.
   Runs a forward solve with L and a backward solve with L^T via cuSPARSE SpSV;
   the SpSV analysis was done in MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a   = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xv;
  const PetscScalar            *bv;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xv));
  PetscCall(VecCUDAGetArrayRead(b, &bv));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b, with y stored in the scratch vector tri->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, (void *)bv));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_Y, tri->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_L));

  /* Backward solve L^T x = y; repoint descriptor X at the output array first */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, xv));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_Y, tri->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bv));
  PetscCall(VecCUDARestoreArrayWrite(x, &xv));

  PetscCall(PetscLogGpuTimeEnd());
  /* Flop count for the two triangular solves with the stored factor */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1937da112707SJunchao Zhang 
/* Numeric IC(0) factorization on the device with cusparseXcsric02(), followed by the SpSV
   analysis passes needed before MatSolve_SeqAIJCUSPARSE_ICC0() can solve with L and L^T.
   All descriptors, info structures and work buffers were created in
   MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(); here we only refresh the values and factorize. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's current values are on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* skipped for empty (m = 0) matrices */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot reports a zero diagonal encountered during the numeric factorization */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Analysis pass for the forward solve L y = b */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU; /* the factor now lives only on the device */
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* factor is symmetric, so transpose solve == solve */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed in the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1990da112707SJunchao Zhang 
/* Symbolic setup for device IC(0): copy A's sparsity pattern into fact, create the cuSPARSE
   descriptors for the in-place csric02 factorization (matrix M) and the SpSV solves with L
   and L^T, allocate (and partially share) the work buffers, and run the analysis phases.
   The unnamed IS parameter is the row permutation; this path is only selected by
   MatICCFactorSymbolic_SeqAIJCUSPARSE() when that permutation is the identity. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    /* IC(0) here requires a square MATSEQAIJCUSPARSE matrix with no structurally missing diagonal entry */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) keeps A's sparsity pattern, so there is no extra fill */

  aij->row = NULL; /* no row/column permutation on this (natural-ordering) path */
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  /* 32-bit row pointers/column indices as required by the csric02 legacy API */
  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  /* M is the matrix factorized in place by csric02; it must be declared GENERAL */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  /* L shares the (rowptr, colidx, val) arrays with M; FILL_MODE_LOWER makes SpSV read only the lower triangle */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M)); /* skipped for empty (m = 0) matrices */

  /* Scratch dense vectors of length m used by the SpSV descriptors */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M; /* aliased, not separately freed */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M; /* aliased, not separately freed */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization; logged per numeric factorization in MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2;
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2132da112707SJunchao Zhang #endif
2133da112707SJunchao Zhang 
/* Numeric LU factorization: factor on the CPU with the SeqAIJ kernel, then wire up
   either the device (cuSPARSE) or the host triangular-solve callbacks on B. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  /* use_cpu_solve lives in A's Mat_SeqAIJCUSPARSE; B (the factor) carries Mat_SeqAIJCUSPARSETriFactors instead */
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* host factorization needs A's values on the host */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusp->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* Pre-11.4: pick the permutation-free solve variant when both orderings are the identity */
    Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data;
    PetscBool   rid, cid;

    PetscCall(ISIdentity(b->row, &rid));
    PetscCall(ISIdentity(b->col, &cid));
    B->ops->solve          = (rid && cid) ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = (rid && cid) ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the device for the GPU solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2172d460d7bfSJunchao Zhang 
/* Symbolic LU: discard any stale device triangular factors, then delegate to the SeqAIJ
   symbolic phase and register the CUSPARSE numeric factorization callback. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2183d460d7bfSJunchao Zhang 
/* Symbolic ILU: take the all-device ILU(0) fast path when possible (CUDA >= 11.4,
   zero fill levels, natural ordering, factorization bound to the device); otherwise
   fall back to the host SeqAIJ symbolic factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool rid = PETSC_FALSE, cid = PETSC_FALSE;

  if (tri->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &rid));
    PetscCall(ISIdentity(iscol, &cid));
  }
  if (!info->levels && rid && cid) {
    /* ILU(0) with identity orderings: factorize entirely on the device */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2206da112707SJunchao Zhang 
/* Symbolic ICC: take the all-device IC(0) fast path when possible (CUDA >= 11.4,
   zero fill levels, identity permutation, factorization bound to the device);
   otherwise fall back to the host SeqAIJ symbolic factorization. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  PetscBool pid = PETSC_FALSE;

  if (tri->factorizeOnDevice) PetscCall(ISIdentity(perm, &pid));
  if (!info->levels && pid) {
    /* IC(0) with identity permutation: factorize entirely on the device */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2226da112707SJunchao Zhang 
/* Symbolic Cholesky: discard any stale device triangular factors, then delegate to the
   SeqAIJ symbolic phase and register the CUSPARSE numeric factorization callback. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2237da112707SJunchao Zhang 
/* Report MATSOLVERCUSPARSE as the solver package that produced this factored matrix */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2244841d4cb1SJunchao Zhang 
2245841d4cb1SJunchao Zhang /*MC
2246841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2252841d4cb1SJunchao Zhang 
2253841d4cb1SJunchao Zhang   Level: beginner
2254841d4cb1SJunchao Zhang 
22551cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
22562ef1f0ffSBarry Smith           `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2257841d4cb1SJunchao Zhang M*/
2258841d4cb1SJunchao Zhang 
/* MatGetFactor_seqaijcusparse_cusparse - create an (empty) factor matrix B of type MATSEQAIJCUSPARSE
   for the square MATSEQAIJCUSPARSE matrix A; B will later hold an LU/ILU or Cholesky/ICC factorization.

   Only sizes, type, symbolic-factorization function pointers and preferred orderings are set here;
   no numeric work is performed. The -mat_factor_bind_factorization option selects whether the
   numeric factorization will run on the host or the device. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n)); /* factor matrix is square, sized by A's row map */
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* parse -mat_factor_bind_factorization under the factor's (or A's) options prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)*B), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)*B), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      /* GPU path: CUSPARSE-backed symbolic factorization kernels */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* matrix bound to CPU: fall back to the plain SeqAIJ kernels */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2311841d4cb1SJunchao Zhang 
/* MatSeqAIJCUSPARSECopyFromGPU - copy the matrix values from the device back to the host CSR
   array a->a when the up-to-date copy lives only on the GPU; afterwards both copies are valid.

   For a non-factored matrix the values come from the cusparse CsrMatrix; for a factored matrix
   (CUDA >= 11.4 only) they come from fs->csrVal when available. Sparsity pattern (i/j) is not
   copied -- only the numerical values. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; /* spptr holds the tri-factors struct when A is factored */
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
23417e8381f9SStefano Zampini 
/* Give the caller read/write access to the host values array; sync values
   from the device first so the host copy is current. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
234967a45760SJunchao Zhang 
/* End read/write access to the host values array: invalidate the caller's
   pointer and mark the device copy stale since the host data may have changed. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
235767a45760SJunchao Zhang 
/* Give the caller read-only access to the host values array, syncing values
   down from the device first if the GPU copy is the current one. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
236567a45760SJunchao Zhang 
/* End read-only access: the data was not modified, so the offload mask is left
   untouched; only the caller's pointer is invalidated. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237267a45760SJunchao Zhang 
/* Give the caller write-only access to the host values array; no device-to-host
   sync is needed because the existing values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
237967a45760SJunchao Zhang 
/* End write access: invalidate the caller's pointer and mark the host copy as
   the valid one, since the device values are now out of date. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
23877e8381f9SStefano Zampini 
/* MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - return device pointers to the CSR arrays
   (row offsets i, column indices j, values a) of the GPU copy of A, and the memory type
   (CUDA device memory) of those pointers.

   Any of i, j, a, mtype may be NULL if the caller does not need that output.
   Not supported for factored matrices. The matrix is synced to the GPU first.

   Fix vs. previous revision: corrected the ungrammatical user-facing error message
   "does not supported" -> "does not support" in both 64-bit-index branches. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    /* the device CSR arrays are 32-bit (THRUSTINTARRAY32), so they cannot be
       handed out as PetscInt* in a 64-bit-index build */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
24187ee59b9bSJunchao Zhang 
/* MatSeqAIJCUSPARSECopyToGPU - mirror the host CSR data of A onto the device.

   Two paths:
   - fast path: if the nonzero pattern is unchanged (nonzerostate matches) and the format
     is CSR, only the numerical values are re-uploaded and the cached transpose is
     invalidated (values only);
   - slow path: otherwise the whole device structure is destroyed and rebuilt (row offsets,
     column indices, values, scalar constants, and optionally the compressed-row index list).

   On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host values array
   a->a is absent, in which case only the pattern was uploaded and the mask is left alone. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE when only the pattern (no values) is uploaded */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* nonzero pattern changed (or non-CSR format): rebuild the whole device structure */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: upload pattern only and do not mark OFFLOAD_BOTH */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants (1, 0, 1) used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 only: build a temporary device CSR, convert to HYB/ELL, then free the CSR */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1));
          mat->row_offsets->assign(ii, ii + m + 1);

          PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz));
          mat->column_indices->assign(a->j, a->j + nnz);

          PetscCallCXX(mat->values = new THRUSTARRAY(nnz));
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat                             = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m));
          PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m));
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        /* log the host-to-device traffic: CSR pattern, row-index list, and 3 constants + values */
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate; /* remember the pattern we just uploaded */
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
25689ae82921SPaul Mullowney 
/* Functor for thrust zip iterators: accumulate tuple element 0 into tuple
   element 1 (i.e. y += x), usable on both host and device. */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2576aa372e3fSPaul Mullowney 
/* Functor for thrust zip iterators: copy tuple element 0 into tuple element 1
   (i.e. y = x), usable on both host and device. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
25847e8381f9SStefano Zampini 
/* Functor for thrust zip iterators: copy tuple element 1 into tuple element 0
   (the reverse assignment direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2592e6e9a74fSStefano Zampini 
/* Per-product state cached between the symbolic and numeric phases of
   sparse-matrix products that use cuSPARSE; freed by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool      cisdense; /* presumably: whether the product matrix C started out dense — TODO confirm against callers */
  PetscScalar   *Bt;       /* device buffer (cudaFree'd on destroy) */
  Mat            X;        /* intermediate product matrix (used by PtAP/RARt paths) */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers required by the CUDA >= 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2617ccdfe979SStefano Zampini 
/* MatDestroy_MatMatCusparse - destructor for the MatMatCusparse product data:
   releases all device buffers, cuSPARSE descriptors, the intermediate matrix X,
   and finally the struct itself. Safe to call with partially-initialized state
   (every cuSPARSE handle/buffer is NULL-checked before being freed). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2641ccdfe979SStefano Zampini 
26424742e46bSJacob Faibussowitsch #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal()
2643ccdfe979SStefano Zampini 
2644d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2645d71ae5a4SJacob Faibussowitsch {
2646ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2647ccdfe979SStefano Zampini   Mat                           A, B;
2648afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2649ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2650ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2651ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2652ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2653ccdfe979SStefano Zampini   const PetscScalar            *barray;
2654ccdfe979SStefano Zampini   PetscScalar                  *carray;
2655ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2656ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2657ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2658ccdfe979SStefano Zampini 
2659ccdfe979SStefano Zampini   PetscFunctionBegin;
2660ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
266128b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2662ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2663ccdfe979SStefano Zampini   A      = product->A;
2664ccdfe979SStefano Zampini   B      = product->B;
26659566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
266628b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2667ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2668ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
266928b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
26709566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2671ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2672ccdfe979SStefano Zampini   switch (product->type) {
2673ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2674ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2675ccdfe979SStefano Zampini     mat = cusp->mat;
2676ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2677ccdfe979SStefano Zampini     m   = A->rmap->n;
2678ccdfe979SStefano Zampini     n   = B->cmap->n;
2679ccdfe979SStefano Zampini     break;
2680ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
26811a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2682e6e9a74fSStefano Zampini       mat = cusp->mat;
2683e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2684e6e9a74fSStefano Zampini     } else {
26859566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2686ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2687ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2688e6e9a74fSStefano Zampini     }
2689ccdfe979SStefano Zampini     m = A->cmap->n;
2690ccdfe979SStefano Zampini     n = B->cmap->n;
2691ccdfe979SStefano Zampini     break;
2692ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2693ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2694ccdfe979SStefano Zampini     mat = cusp->mat;
2695ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2696ccdfe979SStefano Zampini     m   = A->rmap->n;
2697ccdfe979SStefano Zampini     n   = B->rmap->n;
2698ccdfe979SStefano Zampini     break;
2699d71ae5a4SJacob Faibussowitsch   default:
2700d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2701ccdfe979SStefano Zampini   }
270228b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2703ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2704ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
27059566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
27069566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2707cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2708afb2bd1cSJunchao Zhang 
27099566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2710c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2711cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
27129566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2713c8378d12SStefano Zampini   } else {
2714cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
27159566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2716c8378d12SStefano Zampini   }
2717c8378d12SStefano Zampini 
27189566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2719afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2720afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2721a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2722afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2723fcdce8c4SStefano Zampini     size_t mmBufferSize;
27249371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
27259371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
27269371c9d4SSatish Balay       mmdata->matBDescr = NULL;
27279371c9d4SSatish Balay     }
2728afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
27299566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2730afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2731afb2bd1cSJunchao Zhang     }
2732c8378d12SStefano Zampini 
27339371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
27349371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
27359371c9d4SSatish Balay       mmdata->matCDescr = NULL;
27369371c9d4SSatish Balay     }
2737afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
27389566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2739afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2740afb2bd1cSJunchao Zhang     }
2741afb2bd1cSJunchao Zhang 
2742afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
27439371c9d4SSatish Balay       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
27449371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
27459371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2746afb2bd1cSJunchao Zhang     }
27479371c9d4SSatish Balay     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
27489371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2749fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
27509566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
27519566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2752fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2753fcdce8c4SStefano Zampini     }
2754afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2755afb2bd1cSJunchao Zhang   } else {
2756afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
27579566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
27589566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
27599566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2760afb2bd1cSJunchao Zhang   }
2761afb2bd1cSJunchao Zhang 
2762afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
27639371c9d4SSatish Balay   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
27649371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2765afb2bd1cSJunchao Zhang #else
2766afb2bd1cSJunchao Zhang   PetscInt k;
2767afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2768ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2769ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2770ccdfe979SStefano Zampini     cublasStatus_t cerr;
2771ccdfe979SStefano Zampini 
27729566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
27739371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
27749371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2775ccdfe979SStefano Zampini     blda = B->cmap->n;
2776afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2777afb2bd1cSJunchao Zhang   } else {
2778afb2bd1cSJunchao Zhang     k = B->rmap->n;
2779ccdfe979SStefano Zampini   }
2780ccdfe979SStefano Zampini 
2781afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
27829371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
27839371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2784afb2bd1cSJunchao Zhang #endif
27859566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
27869566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2787cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2788ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2789cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
27904742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2791ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2792cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
27934742e46bSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2794ccdfe979SStefano Zampini   } else {
2795cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2796ccdfe979SStefano Zampini   }
279748a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
279848a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
27993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2800ccdfe979SStefano Zampini }
2801ccdfe979SStefano Zampini 
2802d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2803d71ae5a4SJacob Faibussowitsch {
2804ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2805ccdfe979SStefano Zampini   Mat                 A, B;
2806ccdfe979SStefano Zampini   PetscInt            m, n;
2807ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2808ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2809ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2810ccdfe979SStefano Zampini 
2811ccdfe979SStefano Zampini   PetscFunctionBegin;
2812ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
281328b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2814ccdfe979SStefano Zampini   A = product->A;
2815ccdfe979SStefano Zampini   B = product->B;
28169566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
281728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2818ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
281908401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2820ccdfe979SStefano Zampini   switch (product->type) {
2821ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2822ccdfe979SStefano Zampini     m = A->rmap->n;
2823ccdfe979SStefano Zampini     n = B->cmap->n;
2824ccdfe979SStefano Zampini     break;
2825ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2826ccdfe979SStefano Zampini     m = A->cmap->n;
2827ccdfe979SStefano Zampini     n = B->cmap->n;
2828ccdfe979SStefano Zampini     break;
2829ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2830ccdfe979SStefano Zampini     m = A->rmap->n;
2831ccdfe979SStefano Zampini     n = B->rmap->n;
2832ccdfe979SStefano Zampini     break;
2833ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2834ccdfe979SStefano Zampini     m = B->cmap->n;
2835ccdfe979SStefano Zampini     n = B->cmap->n;
2836ccdfe979SStefano Zampini     break;
2837ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2838ccdfe979SStefano Zampini     m = B->rmap->n;
2839ccdfe979SStefano Zampini     n = B->rmap->n;
2840ccdfe979SStefano Zampini     break;
2841d71ae5a4SJacob Faibussowitsch   default:
2842d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2843ccdfe979SStefano Zampini   }
28449566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2845ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
28469566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
28479566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2848ccdfe979SStefano Zampini 
2849ccdfe979SStefano Zampini   /* product data */
28509566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2851ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2852afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2853afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
285448a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2855afb2bd1cSJunchao Zhang #endif
2856ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2857ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
28589566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
28599566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2860ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
28619566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2862ccdfe979SStefano Zampini     } else {
28639566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2864ccdfe979SStefano Zampini     }
2865ccdfe979SStefano Zampini   }
2866ccdfe979SStefano Zampini   C->product->data    = mmdata;
2867ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2868ccdfe979SStefano Zampini 
2869ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
28703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2871ccdfe979SStefano Zampini }
2872ccdfe979SStefano Zampini 
2873d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2874d71ae5a4SJacob Faibussowitsch {
2875ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2876fcdce8c4SStefano Zampini   Mat                           A, B;
2877fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2878fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2879fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2880fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2881fcdce8c4SStefano Zampini   PetscBool                     flg;
2882fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2883fcdce8c4SStefano Zampini   MatProductType                ptype;
2884fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2885fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2886fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2887fcdce8c4SStefano Zampini #endif
2888b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2889ccdfe979SStefano Zampini 
2890ccdfe979SStefano Zampini   PetscFunctionBegin;
2891ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
289228b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
28939566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
289428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2895fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2896fcdce8c4SStefano Zampini   A      = product->A;
2897fcdce8c4SStefano Zampini   B      = product->B;
2898fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2899fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2900fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
290108401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2902fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
290328b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2904fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
290528b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2906fcdce8c4SStefano Zampini     goto finalize;
2907fcdce8c4SStefano Zampini   }
2908fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
29099566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
291028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
29119566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
291228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
291328b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
291428b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2915fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2916fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2917fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
291808401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
291908401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
292008401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
29219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
29229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2923fcdce8c4SStefano Zampini 
2924fcdce8c4SStefano Zampini   ptype = product->type;
2925b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2926fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
292728b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2928fa046f9fSJunchao Zhang   }
2929b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2930fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
293128b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2932fa046f9fSJunchao Zhang   }
2933fcdce8c4SStefano Zampini   switch (ptype) {
2934fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2935fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2936fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2937fcdce8c4SStefano Zampini     break;
2938fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2939fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2940fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2941fcdce8c4SStefano Zampini     break;
2942fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2943fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2944fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2945fcdce8c4SStefano Zampini     break;
2946d71ae5a4SJacob Faibussowitsch   default:
2947d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2948fcdce8c4SStefano Zampini   }
2949fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
295028b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
295128b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
295228b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2953fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2954fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2955fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
295628b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
295728b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
295828b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
29599566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2960fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2961fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
29629566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2963b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
29649371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29659371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2966b4285af6SJunchao Zhang   #else
29679371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29689371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29699371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29709371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2971b4285af6SJunchao Zhang   #endif
2972fcdce8c4SStefano Zampini #else
29739371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29749371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
29759371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2976fcdce8c4SStefano Zampini #endif
29779566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
29789566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
29799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2980fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2981fcdce8c4SStefano Zampini finalize:
2982fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
29839566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
29849566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
29859566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2986fcdce8c4SStefano Zampini   c->reallocs = 0;
2987fcdce8c4SStefano Zampini   C->info.mallocs += 0;
2988fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2989fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2990fcdce8c4SStefano Zampini   C->num_ass++;
29913ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2992ccdfe979SStefano Zampini }
2993fcdce8c4SStefano Zampini 
2994d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2995d71ae5a4SJacob Faibussowitsch {
2996fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2997fcdce8c4SStefano Zampini   Mat                           A, B;
2998fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2999fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
3000fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
3001fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
3002fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
3003fcdce8c4SStefano Zampini   PetscBool                     flg;
3004fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
3005fcdce8c4SStefano Zampini   MatProductType                ptype;
3006fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3007fcdce8c4SStefano Zampini   PetscLogDouble                flops;
3008fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
3009fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3010fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
3011fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
3012fcdce8c4SStefano Zampini #else
3013fcdce8c4SStefano Zampini   int cnz;
3014fcdce8c4SStefano Zampini #endif
3015b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3016fcdce8c4SStefano Zampini 
3017fcdce8c4SStefano Zampini   PetscFunctionBegin;
3018fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
301928b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
3020fcdce8c4SStefano Zampini   A = product->A;
3021fcdce8c4SStefano Zampini   B = product->B;
30229566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
302328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
30249566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
302528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
3026fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
3027fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
3028fcdce8c4SStefano Zampini   /* product data */
30299566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3030fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3031fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3032fcdce8c4SStefano Zampini 
30339566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
30349566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3035d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3036d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
303708401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
303808401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
3039d60bce21SJunchao Zhang 
3040fcdce8c4SStefano Zampini   ptype = product->type;
3041b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
3042fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3043fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3044fa046f9fSJunchao Zhang   }
3045b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
3046fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
3047fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3048fa046f9fSJunchao Zhang   }
3049fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3050fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3051fcdce8c4SStefano Zampini   switch (ptype) {
3052fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3053fcdce8c4SStefano Zampini     m    = A->rmap->n;
3054fcdce8c4SStefano Zampini     n    = B->cmap->n;
3055fcdce8c4SStefano Zampini     k    = A->cmap->n;
3056fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3057fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3058fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3059fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3060fcdce8c4SStefano Zampini     break;
3061fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3062fcdce8c4SStefano Zampini     m = A->cmap->n;
3063fcdce8c4SStefano Zampini     n = B->cmap->n;
3064fcdce8c4SStefano Zampini     k = A->rmap->n;
30659566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3066fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3067fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3068fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3069fcdce8c4SStefano Zampini     break;
3070fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3071fcdce8c4SStefano Zampini     m = A->rmap->n;
3072fcdce8c4SStefano Zampini     n = B->rmap->n;
3073fcdce8c4SStefano Zampini     k = A->cmap->n;
30749566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3075fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3076fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3077fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3078fcdce8c4SStefano Zampini     break;
3079d71ae5a4SJacob Faibussowitsch   default:
3080d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
3081fcdce8c4SStefano Zampini   }
3082fcdce8c4SStefano Zampini 
3083fcdce8c4SStefano Zampini   /* create cusparse matrix */
30849566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
30859566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
3086fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
3087fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
3088fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3089fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3090fcdce8c4SStefano Zampini 
3091fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3092fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3093fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
30949566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
30959566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
3096fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3097fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3098fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
3099fcdce8c4SStefano Zampini   } else {
3100fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3101fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3102fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3103fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3104fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3105fcdce8c4SStefano Zampini   }
3106fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
3107fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
3108fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
3109fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3110fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3111fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
31129566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
31139566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
31149566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
3115f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
3116f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
3117f4f49eeaSPierre Jolivet   PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
31189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31199566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
31209566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
3121fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3122d460d7bfSJunchao Zhang     PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0));
3123fcdce8c4SStefano Zampini     c->nz                = 0;
3124fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3125fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
3126fcdce8c4SStefano Zampini     goto finalizesym;
3127fcdce8c4SStefano Zampini   }
3128fcdce8c4SStefano Zampini 
312928b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
313028b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
3131fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
3132fcdce8c4SStefano Zampini   if (!biscompressed) {
3133fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
3134fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3135fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3136fcdce8c4SStefano Zampini #endif
3137fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3138fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
3139fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
3140fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3141fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3142fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3143fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3144fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3145fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3146fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
3147fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
31489566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
3149fcdce8c4SStefano Zampini     }
3150fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3151fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
3152fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3153fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
31549371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31559371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
3156fcdce8c4SStefano Zampini     }
3157fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3158fcdce8c4SStefano Zampini #endif
3159fcdce8c4SStefano Zampini   }
316028b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
316128b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
3162fcdce8c4SStefano Zampini   /* precompute flops count */
3163fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3164fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3165fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3166fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
3167fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
3168fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3169fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
3170fcdce8c4SStefano Zampini       }
3171fcdce8c4SStefano Zampini     }
3172fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3173fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
3174fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
3175fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
3176fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
3177fcdce8c4SStefano Zampini     }
3178fcdce8c4SStefano Zampini   } else { /* TODO */
3179fcdce8c4SStefano Zampini     flops = 0.;
3180fcdce8c4SStefano Zampini   }
3181fcdce8c4SStefano Zampini 
3182fcdce8c4SStefano Zampini   mmdata->flops = flops;
31839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3184b4285af6SJunchao Zhang 
3185fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
31869566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
31871ffab3bdSJunchao Zhang   // cuda-12.2 requires non-null csrRowOffsets
31881ffab3bdSJunchao Zhang   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
31899371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
31909566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3191b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
3192b4285af6SJunchao Zhang   {
3193b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3194b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3195b4285af6SJunchao Zhang   */
3196b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3197b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3198b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3199b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3200b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3201b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3202b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3203b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3204b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3205b4285af6SJunchao Zhang 
3206b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
32079371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
32089371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32099566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3210b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
32119371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
32129371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3213b4285af6SJunchao Zhang 
32149371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
32159371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
32179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
32189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
32199371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
32209371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
32229566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3223b4285af6SJunchao Zhang 
3224b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
32259566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3226b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3227b4285af6SJunchao Zhang     /* allocate matrix C */
32289371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32299371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32309371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
32319371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3232b4285af6SJunchao Zhang     /* update matC with the new pointers */
32339371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32349371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3235b4285af6SJunchao Zhang 
32369371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
32379371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32389566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
32399371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
32409371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32419566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
32429371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32439371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
32449566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3245b4285af6SJunchao Zhang   }
3246ae37ee31SJunchao Zhang   #else
3247b4285af6SJunchao Zhang   size_t bufSize2;
3248fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
32499371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
32509371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32519566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3252fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
32539371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
32549371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3255fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
32569371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
32579371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3258fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3259fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3260fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3261fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3262fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
32639566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3264fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
32659371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
32669371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3267fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
32689566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3269fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
32709371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
32719371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3272fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32739566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3274fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
32759566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
32769371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
32779371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
32789371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
32799371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3280ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3281fcdce8c4SStefano Zampini #else
32829566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
32839371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
32849371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
32859371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3286fcdce8c4SStefano Zampini   c->nz                = cnz;
3287fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
32889566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3289fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
32909566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3291fcdce8c4SStefano Zampini 
32929566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3293fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3294fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3295fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
32969371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
32979371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
32989371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3299fcdce8c4SStefano Zampini #endif
33009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
33019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3302fcdce8c4SStefano Zampini finalizesym:
3303fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3304fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3305fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
33069566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
33079566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
33087de69702SBarry Smith   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
3309fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3310fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3311fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3312fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3313fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3314fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3317fcdce8c4SStefano Zampini   } else {
3318fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3319fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
33209566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
33219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3322fcdce8c4SStefano Zampini   }
3323fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3324fcdce8c4SStefano Zampini     PetscInt r = 0;
3325fcdce8c4SStefano Zampini     c->i[0]    = 0;
3326fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3327fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3328fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3329fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3330fcdce8c4SStefano Zampini     }
3331fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3332fcdce8c4SStefano Zampini   }
33339566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
33349566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
33359566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3336fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3337fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3338fcdce8c4SStefano Zampini   c->rmax          = 0;
3339fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3340fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3341fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3342fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3343fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3344fcdce8c4SStefano Zampini   }
33459566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
33469566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3347fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3348fcdce8c4SStefano Zampini 
3349fcdce8c4SStefano Zampini   C->nonzerostate++;
33509566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
33519566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3352fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3353fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3354fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3355fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3356fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3357abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3358fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3359fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3360fcdce8c4SStefano Zampini   }
3361fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
33623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3363fcdce8c4SStefano Zampini }
3364fcdce8c4SStefano Zampini 
3365fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3366fcdce8c4SStefano Zampini 
3367fcdce8c4SStefano Zampini /* handles sparse or dense B */
3368d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3369d71ae5a4SJacob Faibussowitsch {
3370fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3371fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3372fcdce8c4SStefano Zampini 
3373fcdce8c4SStefano Zampini   PetscFunctionBegin;
3374fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
33759566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
337648a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3377fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3378fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
337948a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3380fcdce8c4SStefano Zampini   }
338165e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
338265e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
338365e4b4d4SStefano Zampini     switch (product->type) {
338465e4b4d4SStefano Zampini     case MATPRODUCT_AB:
338565e4b4d4SStefano Zampini       if (product->api_user) {
3386d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
33879566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3388d0609cedSBarry Smith         PetscOptionsEnd();
338965e4b4d4SStefano Zampini       } else {
3390d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
33919566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3392d0609cedSBarry Smith         PetscOptionsEnd();
339365e4b4d4SStefano Zampini       }
339465e4b4d4SStefano Zampini       break;
339565e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
339665e4b4d4SStefano Zampini       if (product->api_user) {
3397d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
33989566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3399d0609cedSBarry Smith         PetscOptionsEnd();
340065e4b4d4SStefano Zampini       } else {
3401d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
34029566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3403d0609cedSBarry Smith         PetscOptionsEnd();
340465e4b4d4SStefano Zampini       }
340565e4b4d4SStefano Zampini       break;
340665e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
340765e4b4d4SStefano Zampini       if (product->api_user) {
3408d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
34099566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3410d0609cedSBarry Smith         PetscOptionsEnd();
341165e4b4d4SStefano Zampini       } else {
3412d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
34139566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3414d0609cedSBarry Smith         PetscOptionsEnd();
341565e4b4d4SStefano Zampini       }
341665e4b4d4SStefano Zampini       break;
341765e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
341865e4b4d4SStefano Zampini       if (product->api_user) {
3419d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
34209566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3421d0609cedSBarry Smith         PetscOptionsEnd();
342265e4b4d4SStefano Zampini       } else {
3423d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
34249566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3425d0609cedSBarry Smith         PetscOptionsEnd();
342665e4b4d4SStefano Zampini       }
342765e4b4d4SStefano Zampini       break;
342865e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
342965e4b4d4SStefano Zampini       if (product->api_user) {
3430d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
34319566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3432d0609cedSBarry Smith         PetscOptionsEnd();
343365e4b4d4SStefano Zampini       } else {
3434d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
34359566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3436d0609cedSBarry Smith         PetscOptionsEnd();
343765e4b4d4SStefano Zampini       }
343865e4b4d4SStefano Zampini       break;
3439d71ae5a4SJacob Faibussowitsch     default:
3440d71ae5a4SJacob Faibussowitsch       break;
344165e4b4d4SStefano Zampini     }
344265e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
344365e4b4d4SStefano Zampini   }
344465e4b4d4SStefano Zampini   /* dispatch */
3445fcdce8c4SStefano Zampini   if (isdense) {
3446ccdfe979SStefano Zampini     switch (product->type) {
3447ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3448ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3449ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3450ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3451ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3452fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
34539566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3454fcdce8c4SStefano Zampini       } else {
3455fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3456fcdce8c4SStefano Zampini       }
3457fcdce8c4SStefano Zampini       break;
3458d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3459d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3460d71ae5a4SJacob Faibussowitsch       break;
3461d71ae5a4SJacob Faibussowitsch     default:
3462d71ae5a4SJacob Faibussowitsch       break;
3463ccdfe979SStefano Zampini     }
3464fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3465fcdce8c4SStefano Zampini     switch (product->type) {
3466fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3467fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3468d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3469d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3470d71ae5a4SJacob Faibussowitsch       break;
3471fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3472fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3473d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3474d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3475d71ae5a4SJacob Faibussowitsch       break;
3476d71ae5a4SJacob Faibussowitsch     default:
3477d71ae5a4SJacob Faibussowitsch       break;
3478fcdce8c4SStefano Zampini     }
3479fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
34809566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3481fcdce8c4SStefano Zampini   }
34823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3483ccdfe979SStefano Zampini }
3484ccdfe979SStefano Zampini 
3485d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3486d71ae5a4SJacob Faibussowitsch {
34879ae82921SPaul Mullowney   PetscFunctionBegin;
34889566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
34893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3490e6e9a74fSStefano Zampini }
3491e6e9a74fSStefano Zampini 
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* zz = A*xx + yy: shared kernel, no transpose, no Hermitian */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3498e6e9a74fSStefano Zampini 
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* yy = A^H*xx: shared kernel with trans = PETSC_TRUE, herm = PETSC_TRUE */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3505e6e9a74fSStefano Zampini 
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* zz = A^H*xx + yy: shared kernel with trans = PETSC_TRUE, herm = PETSC_TRUE */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
35129ae82921SPaul Mullowney 
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  PetscFunctionBegin;
  /* yy = A^T*xx: shared kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3519ca45077fSPaul Mullowney 
/* y[idx[i]] += x[i] for 0 <= i < n. Used to scatter-add the compressed-row work vector into the
   full-length result vector (see MatMultAddKernel_SeqAIJCUSPARSE(), which launches it on
   PetscDefaultCudaStream with a 1-D grid).

   The grid-stride loop keeps the kernel correct for any 1-D launch configuration and performs
   the index arithmetic in PetscInt, avoiding the 32-bit overflow the previous
   `int i = blockIdx.x * blockDim.x + threadIdx.x` suffered when n exceeds INT_MAX
   (PetscInt may be 64-bit).

   NOTE(review): no atomics are used, so the idx[] entries must be distinct. That holds for
   cprowIndices (each nonzero row appears once) -- confirm for any new caller. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y)
{
  const PetscInt stride = (PetscInt)gridDim.x * blockDim.x;
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += stride) y[idx[i]] += x[i];
}
3525a0e72f99SJunchao Zhang 
/* Shared kernel behind all SeqAIJCUSPARSE MatMult variants: z = op(A) x + beta*y.
   If trans && !herm, op = ^T; if trans && herm, op = ^H; if !trans, op = no-op (herm alone is rejected).
   yy may be NULL (plain multiply, beta = 0) or equal to zz (in-place add). */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
{
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx, ny; /* logical lengths of x and y for the chosen op(A); set below from the CSR matrix */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: the product is zero, so only the add (if any) remains */
    if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
    else PetscCall(VecSeq_CUDA::Set(zz, 0));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) { /* let cuSPARSE apply the (conjugate) transpose on the fly */
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else { /* use an explicitly stored A^T (built on demand) so the non-transpose SpMV path can be used */
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);

        thrust::for_each(
#if PetscDefined(HAVE_THRUST_ASYNC)
          thrust::cuda::par.on(PetscDefaultCudaStream),
#endif
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSeq_CUDA::Set(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
  }
  /* 2 flops (mul+add) per stored nonzero; without yy, the formula subtracts one add per nonzero row */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
36969ae82921SPaul Mullowney 
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  PetscFunctionBegin;
  /* zz = A^T*xx + yy: shared kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
3703ca45077fSPaul Mullowney 
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  PetscFunctionBegin;
  /* Assembly runs entirely on the host; the device copy is refreshed lazily by
     MatSeqAIJCUSPARSECopyToGPU() the next time the matrix is used on the GPU
     (see e.g. MatMultAddKernel_SeqAIJCUSPARSE()) */
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  PetscFunctionReturn(PETSC_SUCCESS);
}
37109ae82921SPaul Mullowney 
/*@
  MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format).

  Collective

  Input Parameters:
+ comm - MPI communicator, set to `PETSC_COMM_SELF`
. m    - number of rows
. n    - number of columns
. nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
- nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  This matrix will ultimately be pushed down to NVIDIA GPUs and use the CuSPARSE library for
  calculations. For good matrix assembly performance the user should preallocate the matrix
  storage by setting the parameter `nz` (or the array `nnz`).

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format, also called
  compressed row storage, is fully compatible with standard Fortran
  storage.  That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  Specify the preallocated storage with either nz or nnz (not both).
  Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
  allocation.

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
37589ae82921SPaul Mullowney 
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* All method implementations composed on a SeqAIJCUSPARSE matrix; removed (set to NULL)
     before handing the object to the base SeqAIJ destructor */
  const char *const composed_ops[] = {"MatSeqAIJCopySubArray_C", "MatCUSPARSESetFormat_C", "MatCUSPARSESetUseCPUSolve_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
                                      "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", "MatFactorGetSolverType_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatConvert_seqaijcusparse_hypre_C"};

  PetscFunctionBegin;
  /* Release the GPU-side storage: plain matrices carry a Mat_SeqAIJCUSPARSE in spptr,
     factored matrices carry triangular-factor data instead */
  if (A->factortype == MAT_FACTOR_NONE) PetscCall(MatSeqAIJCUSPARSE_Destroy(A));
  else PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  for (size_t i = 0; i < sizeof(composed_ops) / sizeof(composed_ops[0]); i++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed_ops[i], NULL));
  PetscCall(MatDestroy_SeqAIJ(A)); /* host-side (CPU) cleanup */
  PetscFunctionReturn(PETSC_SUCCESS);
}
37809ae82921SPaul Mullowney 
3781ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
378295639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* Duplicate on the host as a plain SeqAIJ, then convert the copy in place back to SeqAIJCUSPARSE */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
37909ff858a8SKarl Rupp 
/* Y = Y + a*X entirely on the device when both matrices live on the GPU in CSR format.
   Three paths: cuBLAS axpy on the value arrays (SAME_NONZERO_PATTERN), cuSPARSE spgeam
   (SUBSET_NONZERO_PATTERN), and fallback to the host MatAXPY_SeqAIJ otherwise. */
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) { /* mixed bindings (e.g. one matrix bound to CPU): do the work on the host */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: promote to SAME_NONZERO_PATTERN when the
     CSR row offsets and column indices of X and Y compare equal on the device */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cuSPARSE spgeam, writing the result in place over Y's arrays */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b below are host pointers, so switch the pointer mode around the spgeam calls */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical nonzero patterns: the update is a dense axpy on the value arrays */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
387695639643SRichard Tran Mills 
/* Y = a*Y, done in place on the device as a single cuBLAS scal over the stored nonzeros */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y)
{
  PetscFunctionBegin;
  PetscFunctionReturn(PETSC_SUCCESS);
}
389633c9ba73SStefano Zampini 
/* Zero all stored values of A.

   For an unfactored matrix the device-side CSR values (and those of the cached
   transpose, if present) are filled with zeros via thrust; the host array a->a
   is always zeroed.  When the device copy of the (non-transposed) matrix was
   zeroed as well, both copies agree and the offload mask becomes
   PETSC_OFFLOAD_BOTH; otherwise only the CPU copy is declared valid. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE;
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      /* NOTE(review): the cast assumes CSR storage; for ELL/HYB formats mat->mat
         would not be a CsrMatrix -- presumably those formats never reach here. TODO confirm */
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); /* a->i[nrows] is the total number of stored entries (CSR row offsets) */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
39233fa6b06aSMark Adams 
/* Bind A's operations to the CPU (flg == PETSC_TRUE) or to the GPU via
   cuSPARSE (flg == PETSC_FALSE).

   Swaps the Mat and Mat_SeqAIJ operation tables between the plain SeqAIJ
   kernels and the CUSPARSE kernels, and composes/removes the matching query
   functions.  Factored matrices only record the flag.  Binding to the CPU
   first copies the current GPU values back so no data is lost. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices keep their solve path; just remember the binding */
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); /* make the host copy current before CPU-only operation */

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps))); /* clear AIJ-level hooks so defaults apply */
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are a CPU-only optimization; enable them only when bound to CPU and available */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3987a587d139SMark 
/* Convert a MATSEQAIJ matrix into a MATSEQAIJCUSPARSE matrix.

   For MAT_INITIAL_MATRIX the input is duplicated (values included); for
   MAT_REUSE_MATRIX the values are copied into *newmat; for MAT_INPLACE_MATRIX
   A itself is retyped.  A fresh Mat_SeqAIJCUSPARSE (or, for factored
   matrices, Mat_SeqAIJCUSPARSETriFactors) context with its own cuSPARSE
   handle is attached, the default vector type is switched to VECCUDA, the
   op table is redirected to the CUSPARSE implementations, and the type name
   and composed query functions are installed. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created to match this matrix should live on the GPU */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor state instead of the mult context */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; /* nothing has been pushed to the device yet */
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); /* install the GPU op tables */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
40479ae82921SPaul Mullowney 
/* Type constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix and
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
405502fe1965SBarry Smith 
40563ca39a21SBarry Smith /*MC
4057e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
4058e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
4062e057df02SPaul Mullowney 
4063e057df02SPaul Mullowney    Options Database Keys:
406411a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
40652ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
40662ef1f0ffSBarry Smith                                       Other options include ell (ellpack) or hyb (hybrid).
40672ef1f0ffSBarry Smith .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
40682ef1f0ffSBarry Smith -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
4069e057df02SPaul Mullowney 
4070e057df02SPaul Mullowney   Level: beginner
4071e057df02SPaul Mullowney 
40721cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
4073e057df02SPaul Mullowney M*/
40747f756511SDominic Meiser 
/* Register the cuSPARSE solver package for LU, Cholesky, ILU, and ICC
   factorizations of MATSEQAIJCUSPARSE matrices; all four factor types share
   one factory function. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
408429b38603SBarry Smith 
/* Tear down the Mat_SeqAIJCUSPARSE context hanging off mat->spptr: the mult
   structs for the matrix and its cached transpose, the device work buffers,
   and the cuSPARSE handle, then free the context itself. Safe to call when
   no context is attached. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);

  PetscFunctionBegin;
  if (cusp) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->csr2csc_i;
    delete cusp->coords;
    if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
    PetscCall(PetscFree(mat->spptr)); /* frees cusp and NULLs mat->spptr */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
41027f756511SDominic Meiser 
/* Free a CsrMatrix (its values, column-index, and row-offset vectors, then
   the struct itself) and reset the caller's pointer. A NULL *mat is a no-op,
   so the function is safe to call repeatedly. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    delete (*mat)->values;
    delete (*mat)->column_indices;
    delete (*mat)->row_offsets;
    delete *mat;
    *mat = NULL; /* was '*mat = 0'; use NULL to match the pointer-reset convention used elsewhere in this file */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
41157f756511SDominic Meiser 
4116b917901dSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
/* (CUDA < 11.4 legacy path) Free one triangular-factor structure: its matrix
   descriptor, csrsv solve info, CSR storage, device solve buffer, pinned host
   copy of the values, and (CUDA >= 11.0) the csr2csc scratch buffer.  A NULL
   *trifactor is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory needs cudaFreeHost */
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
  #endif
    PetscCall(PetscFree(*trifactor)); /* frees the struct and NULLs the caller's pointer */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4133d460d7bfSJunchao Zhang #endif
41347f756511SDominic Meiser 
/* Free a mult structure and everything it owns: the stored matrix (CSR or,
   pre-CUDA-11, HYB), the legacy matrix descriptor, the compressed-row index
   vector, the device-resident scalar constants, and (CUDA >= 11.0) the
   generic SpMat descriptor plus any initialized SpMV buffers/descriptors.
   *matstruct is reset to NULL; a NULL input is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by the SpMV calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) { /* one cached SpMV setup per operation variant */
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
41767f756511SDominic Meiser 
/* Release all factorization state held by a TriFactors context while keeping
   the context itself (and its cuSPARSE handle) alive for reuse by a later
   factorization.  The resources freed depend on the CUDA version: the legacy
   (< 11.4) path uses per-factor trifactor structs and a work vector, the
   modern (>= 11.4) path uses SpSV descriptors and raw device buffers.

   NOTE(review): most device pointers freed in the >= 11.4 branch are not set
   back to NULL here -- presumably callers always reallocate them before the
   next use; confirm before calling Reset twice in a row. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
#if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->workVector;
    fs->workVector = NULL;
#endif
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    fs->rpermIndices  = NULL;
    fs->cpermIndices  = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrRowPtr32));
    PetscCallCUDA(cudaFree(fs->csrColIdx32));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->diag));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    PetscCall(PetscFree(fs->csrRowPtr_h));
    PetscCall(PetscFree(fs->csrVal_h));
    PetscCall(PetscFree(fs->diag_h));
    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
4230ccdfe979SStefano Zampini 
/* Fully destroy a TriFactors context: reset its contents, destroy the
   cuSPARSE handle it owns, and free the struct (NULLing the caller's
   pointer).  A NULL *trifactors is a no-op. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
42417e8381f9SStefano Zampini 
/* Functor ordering (i,j) index pairs lexicographically: first by row index,
   then by column index within equal rows.  Usable from both host and device
   (e.g. with thrust::sort). */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt row1 = thrust::get<0>(t1), row2 = thrust::get<0>(t2);
    if (row1 != row2) return row1 < row2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
42507e8381f9SStefano Zampini 
/* Mark the cached transpose of A as stale so it is rebuilt before its next
   use; with destroy == PETSC_TRUE also free the cached transpose structure
   and the csr2csc index map.  A matrix without a CUSPARSE context is a
   no-op. */
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE; /* forces regeneration of the transpose values on next use */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4266a49f1ed0SStefano Zampini 
/* PetscContainer destructor for the device-side COO assembly struct: frees
   the device perm/jmap arrays and then the (host-allocated) struct itself. */
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void *data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4277ed502f03SStefano Zampini 
/* Preallocate the matrix from COO (i,j) index lists for later
   MatSetValuesCOO() calls.

   The indices may live in host or device memory; device indices are first
   copied to temporary host buffers because the CPU preallocation routine
   needs them there.  After the host-side preallocation, the CSR structure is
   pushed to the GPU, and a device copy of the host COO struct (with jmap and
   perm mirrored into device memory) is attached to the matrix in a
   PetscContainer under "__PETSc_MatCOOStruct_Device". */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  // The two MatResetPreallocationCOO_* must be done in order. The former relies on values that might be destroyed by the latter
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    /* indices are on the device; stage them on the host for MatSetPreallocationCOO_SeqAIJ */
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU; /* preallocation defined the host copy; device is stale */
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d)); /* container is now referenced by the matrix */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4323219fbbafSJunchao Zhang 
/* Kernel: fold COO-ordered values into the matrix's CSR value array.

   Grid-stride loop over the nnz CSR slots, so any 1D launch configuration covers all entries.

   kv    - COO input values (device memory), read through perm[]
   nnz   - number of nonzeros (CSR slots) in the assembled matrix
   jmap  - jmap[i]..jmap[i+1] delimits the range in perm[] of COO entries mapping to CSR slot i
   perm  - permutation from sorted order back to the user's COO input order
   imode - INSERT_VALUES overwrites a[i]; otherwise the sum is added to a[i]
   a     - CSR values to update (device memory)
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  /* widen to PetscCount BEFORE multiplying: blockIdx.x/blockDim.x are unsigned int, and the
     product can silently overflow 32-bit arithmetic on very large grids */
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4334219fbbafSJunchao Zhang 
/* MatSetValuesCOO implementation for MATSEQAIJCUSPARSE: assemble values given in the COO
   ordering established by MatSetPreallocationCOO() into the device CSR value array.

   Input Parameters:
     A     - the matrix; MatSetPreallocationCOO() must have been called on it
     v     - the COO values; may live in host or device memory (detected via PetscGetMemType())
     imode - INSERT_VALUES overwrites existing entries, otherwise values are accumulated
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  /* Retrieve the device-side COO struct attached by MatSetPreallocationCOO_SeqAIJCUSPARSE();
     fail with a clear message (instead of a NULL dereference) if preallocation never happened */
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Missing device COO struct: call MatSetPreallocationCOO() before MatSetValuesCOO()");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host memory, stage a device copy for the kernel */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write-only: skip host->device sync */
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the sticky error state */
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4374219fbbafSJunchao Zhang 
43755b7e41feSStefano Zampini /*@C
43762ef1f0ffSBarry Smith   MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
43775b7e41feSStefano Zampini 
43782ef1f0ffSBarry Smith   Not Collective
43795b7e41feSStefano Zampini 
43805b7e41feSStefano Zampini   Input Parameters:
43815b7e41feSStefano Zampini + A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
43835b7e41feSStefano Zampini 
43845b7e41feSStefano Zampini   Output Parameters:
438520f4b53cSBarry Smith + i - the CSR row pointers
438620f4b53cSBarry Smith - j - the CSR column indices
43875b7e41feSStefano Zampini 
43885b7e41feSStefano Zampini   Level: developer
43895b7e41feSStefano Zampini 
439011a5261eSBarry Smith   Note:
43915b7e41feSStefano Zampini   When compressed is true, the CSR structure does not contain empty rows
43925b7e41feSStefano Zampini 
43931cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
43945b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS); /* both output pointers must be requested, otherwise this is a no-op */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* ensure the device CSR structure is up to date before handing out pointers */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* lazily build and cache the uncompressed row offsets on the device from the host CSR row pointers */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get(); /* device CSR already has the requested (possibly compressed) row offsets */
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
44225f101d05SStefano Zampini 
44235b7e41feSStefano Zampini /*@C
44242ef1f0ffSBarry Smith   MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
44255b7e41feSStefano Zampini 
44262ef1f0ffSBarry Smith   Not Collective
44275b7e41feSStefano Zampini 
44285b7e41feSStefano Zampini   Input Parameters:
44295b7e41feSStefano Zampini + A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating whether the matrix data structure should always be returned in compressed form
443120f4b53cSBarry Smith . i          - the CSR row pointers
443220f4b53cSBarry Smith - j          - the CSR column indices
44335b7e41feSStefano Zampini 
44345b7e41feSStefano Zampini   Level: developer
44355b7e41feSStefano Zampini 
44361cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
44375b7e41feSStefano Zampini @*/
/* Hand back the row/column index pointers borrowed via MatSeqAIJCUSPARSEGetIJ().
   Only invalidates the caller's pointers; the device arrays themselves stay alive. */
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  (void)compressed; /* accepted for symmetry with MatSeqAIJCUSPARSEGetIJ(); no action needed */
  if (j) *j = NULL;
  if (i) *i = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
44485f101d05SStefano Zampini 
44495b7e41feSStefano Zampini /*@C
445011a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
44515b7e41feSStefano Zampini 
44525b7e41feSStefano Zampini   Not Collective
44535b7e41feSStefano Zampini 
44545b7e41feSStefano Zampini   Input Parameter:
445511a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
44565b7e41feSStefano Zampini 
44575b7e41feSStefano Zampini   Output Parameter:
44585b7e41feSStefano Zampini . a - pointer to the device data
44595b7e41feSStefano Zampini 
44605b7e41feSStefano Zampini   Level: developer
44615b7e41feSStefano Zampini 
446211a5261eSBarry Smith   Note:
446311a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
44645b7e41feSStefano Zampini 
44651cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
44665b7e41feSStefano Zampini @*/
/* Return a read-only device pointer to the CSR values of a MATSEQAIJCUSPARSE matrix,
   syncing host data to the device first if needed. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device holds current values */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)cusparsestruct->mat->mat;
    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get();
  }
  /* read-only access: offload mask and cached transpose are left untouched */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4484ed502f03SStefano Zampini 
44855b7e41feSStefano Zampini /*@C
448611a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
44875b7e41feSStefano Zampini 
44885b7e41feSStefano Zampini   Not Collective
44895b7e41feSStefano Zampini 
44902ef1f0ffSBarry Smith   Input Parameters:
44912ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
44922ef1f0ffSBarry Smith - a - pointer to the device data
44935b7e41feSStefano Zampini 
44945b7e41feSStefano Zampini   Level: developer
44955b7e41feSStefano Zampini 
44961cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
44975b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no object-state bump or diagonal invalidation needed, just drop the borrowed pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4507ed502f03SStefano Zampini 
45085b7e41feSStefano Zampini /*@C
450911a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45105b7e41feSStefano Zampini 
45115b7e41feSStefano Zampini   Not Collective
45125b7e41feSStefano Zampini 
45135b7e41feSStefano Zampini   Input Parameter:
451411a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45155b7e41feSStefano Zampini 
45165b7e41feSStefano Zampini   Output Parameter:
45175b7e41feSStefano Zampini . a - pointer to the device data
45185b7e41feSStefano Zampini 
45195b7e41feSStefano Zampini   Level: developer
45205b7e41feSStefano Zampini 
452111a5261eSBarry Smith   Note:
452211a5261eSBarry Smith   May trigger host-device copies if up-to-date matrix data is on host
45235b7e41feSStefano Zampini 
45241cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
45255b7e41feSStefano Zampini @*/
/* Return a read-write device pointer to the CSR values of a MATSEQAIJCUSPARSE matrix.
   Syncs host data to the device first, then marks the device copy authoritative. */
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write: existing values must reach the device first */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)cusparsestruct->mat->mat;
    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get();
  }
  /* the caller may write through the pointer: the device copy becomes authoritative
     and any cached explicit transpose is now stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
45455b7e41feSStefano Zampini /*@C
454611a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4547039c6fbaSStefano Zampini 
45485b7e41feSStefano Zampini   Not Collective
45495b7e41feSStefano Zampini 
45502ef1f0ffSBarry Smith   Input Parameters:
45512ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
45522ef1f0ffSBarry Smith - a - pointer to the device data
45535b7e41feSStefano Zampini 
45545b7e41feSStefano Zampini   Level: developer
45555b7e41feSStefano Zampini 
45561cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
45575b7e41feSStefano Zampini @*/
/* Hand back the read-write pointer obtained from MatSeqAIJCUSPARSEGetArray().
   Values may have changed on the device, so the object state is bumped and the
   cached diagonal is discarded. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  *a = NULL; /* invalidate the caller's borrowed pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4569039c6fbaSStefano Zampini 
45705b7e41feSStefano Zampini /*@C
457111a5261eSBarry Smith   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45725b7e41feSStefano Zampini 
45735b7e41feSStefano Zampini   Not Collective
45745b7e41feSStefano Zampini 
45755b7e41feSStefano Zampini   Input Parameter:
457611a5261eSBarry Smith . A - a `MATSEQAIJCUSPARSE` matrix
45775b7e41feSStefano Zampini 
45785b7e41feSStefano Zampini   Output Parameter:
45795b7e41feSStefano Zampini . a - pointer to the device data
45805b7e41feSStefano Zampini 
45815b7e41feSStefano Zampini   Level: developer
45825b7e41feSStefano Zampini 
458311a5261eSBarry Smith   Note:
458411a5261eSBarry Smith   Does not trigger host-device copies and flags data validity on the GPU
45855b7e41feSStefano Zampini 
45861cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
45875b7e41feSStefano Zampini @*/
/* Return a write-only device pointer to the CSR values of a MATSEQAIJCUSPARSE matrix.
   No host->device copy is performed: existing values are assumed to be fully overwritten. */
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only: unlike MatSeqAIJCUSPARSEGetArray(), no MatSeqAIJCUSPARSECopyToGPU() here */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csr = (CsrMatrix *)cusparsestruct->mat->mat;
    PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csr->values->data().get();
  }
  /* device data becomes authoritative; any cached explicit transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4606ed502f03SStefano Zampini 
46075b7e41feSStefano Zampini /*@C
460811a5261eSBarry Smith   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
46095b7e41feSStefano Zampini 
46105b7e41feSStefano Zampini   Not Collective
46115b7e41feSStefano Zampini 
46122ef1f0ffSBarry Smith   Input Parameters:
46132ef1f0ffSBarry Smith + A - a `MATSEQAIJCUSPARSE` matrix
46142ef1f0ffSBarry Smith - a - pointer to the device data
46155b7e41feSStefano Zampini 
46165b7e41feSStefano Zampini   Level: developer
46175b7e41feSStefano Zampini 
46181cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
46195b7e41feSStefano Zampini @*/
/* Hand back the write-only pointer obtained from MatSeqAIJCUSPARSEGetArrayWrite().
   The matrix values were (re)written on the device, so the object state is bumped
   and the cached diagonal is discarded. */
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  *a = NULL; /* invalidate the caller's borrowed pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4631ed502f03SStefano Zampini 
/* Strict weak ordering on (row, col, value, flag) tuples: lexicographic by row then
   column; the value and flag slots do not participate in the comparison. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = thrust::get<0>(t1), r2 = thrust::get<0>(t2);
    if (r1 != r2) return r1 < r2;        /* different rows: order by row */
    return thrust::get<1>(t1) < thrust::get<1>(t2); /* same row: order by column */
  }
};
4640ed502f03SStefano Zampini 
/* Unary functor adding a fixed offset to a column index (used to shift one matrix's
   columns past another's when concatenating). */
struct Shift {
  int _shift; /* offset applied to every index */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4647ed502f03SStefano Zampini 
464821afe8ebSBarry Smith /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in MATLAB notation */
4649d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4650d71ae5a4SJacob Faibussowitsch {
4651ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4652ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4653ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4654ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4655ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4656ed502f03SStefano Zampini   cusparseStatus_t              stat;
4657ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4658ed502f03SStefano Zampini 
4659ed502f03SStefano Zampini   PetscFunctionBegin;
4660ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4661ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
46624f572ea9SToby Isaac   PetscAssertPointer(C, 4);
4663ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4664ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
46655f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
466608401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4667aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4668aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4669ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4670ed502f03SStefano Zampini     m = A->rmap->n;
4671ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
46729566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
46739566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
46749566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4675ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4676ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4677ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4678ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4679ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4680ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4681ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4682ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4683ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4684ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4685ed502f03SStefano Zampini     Ccusp->nrows            = m;
4686ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4687ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4688ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4689ed502f03SStefano Zampini     Ccsr->num_cols          = n;
46909566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
46919566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
46929566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4693f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar)));
4694f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar)));
4695f4f49eeaSPierre Jolivet     PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar)));
46969566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46979566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46989566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46999566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47009566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
470128b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
470228b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4703ed502f03SStefano Zampini 
4704ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4705ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4706ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4707ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4708ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4709ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4710ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4711ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4712ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
47132c4ab24aSJunchao Zhang     Ccusp->coords        = new THRUSTINTARRAY(c->nz);
4714ed502f03SStefano Zampini     if (c->nz) {
47152ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
47162ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
47172ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
47182ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
47192ed87e7eSStefano Zampini 
4720ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4721ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4722ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4723ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
47249566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4725ed502f03SStefano Zampini         }
47262ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
47272ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4728ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4729ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4730ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4731ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
47329566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4733ed502f03SStefano Zampini         }
47342ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
47352ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
47369566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
47379371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47389371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47399371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47409371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47412ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
47422ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
47432ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
47448909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4745ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4746ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
47478909a122SStefano Zampini #else
47488909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
47498909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
47508909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
47518909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
47528909a122SStefano Zampini #endif
47532ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
47542ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
47552ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
47562ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
47572ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
47582ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
47592c4ab24aSJunchao Zhang       auto p1    = Ccusp->coords->begin();
47602c4ab24aSJunchao Zhang       auto p2    = Ccusp->coords->begin();
4761ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4762792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
47638909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
47648909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
47658909a122SStefano Zampini #endif
47662ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
47672ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
47682ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4769792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
47702ed87e7eSStefano Zampini #else
47712ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4772792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4773792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
47742ed87e7eSStefano Zampini #endif
47759371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47769371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47779566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
47782ed87e7eSStefano Zampini       delete wPerm;
47792ed87e7eSStefano Zampini       delete Acoo;
47802ed87e7eSStefano Zampini       delete Bcoo;
47812ed87e7eSStefano Zampini       delete Ccoo;
4782ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47839371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47849371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4785ed502f03SStefano Zampini #endif
47861a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
47879566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
47889566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4789ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4790ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4791ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4792ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4793ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4794ed502f03SStefano Zampini 
47951a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
47961a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4797a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4798ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4799ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4800ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4801ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4802ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4803ed502f03SStefano Zampini 
4804ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4805ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4806ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4807ed502f03SStefano Zampini 
48089566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4809ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4810ed502f03SStefano Zampini         if (AT) {
4811ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4812ed502f03SStefano Zampini           thrust::advance(rT, -1);
4813ed502f03SStefano Zampini         }
4814ed502f03SStefano Zampini         if (BT) {
4815ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4816ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4817ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4818ed502f03SStefano Zampini         }
4819ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4820ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4821ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4822ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4823ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4824ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48259566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4826ed502f03SStefano Zampini 
48279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
48289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
48299566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
4830f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar)));
4831f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar)));
4832f4f49eeaSPierre Jolivet         PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar)));
48339566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48349566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48359566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4836ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48379371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48389371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4839ed502f03SStefano Zampini #endif
4840ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4841ed502f03SStefano Zampini       }
4842ed502f03SStefano Zampini     }
4843ed502f03SStefano Zampini 
4844ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4845ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4846ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
48479566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
48489566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
48497de69702SBarry Smith     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */
4850ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4851ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4852ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4853ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
48549566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
48559566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4856ed502f03SStefano Zampini     } else {
48579566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
48589566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4859ed502f03SStefano Zampini     }
48609566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
48619566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
48629566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4863ed502f03SStefano Zampini     c->maxnz         = c->nz;
4864ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4865ed502f03SStefano Zampini     c->rmax          = 0;
4866ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4867ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4868ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4869ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4870ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4871ed502f03SStefano Zampini     }
48729566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
48739566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4874ed502f03SStefano Zampini     (*C)->nonzerostate++;
48759566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
48769566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4877ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4878ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4879ed502f03SStefano Zampini   } else {
488008401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4881ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4882ed502f03SStefano Zampini     if (c->nz) {
4883ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
48842c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords");
4885aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
488608401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
48879566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
48889566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
48895f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
48905f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4891ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4892ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4893ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4894aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4895aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4896aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4897aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
48982c4ab24aSJunchao Zhang       PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size());
48992c4ab24aSJunchao Zhang       auto pmid = Ccusp->coords->begin();
4900ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
49019566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
49022c4ab24aSJunchao Zhang       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin())));
49039371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4904ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49059371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
49062c4ab24aSJunchao Zhang       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end())));
4907ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
49089566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
49091a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
49105f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4911ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4912ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4913ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4914ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4915ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4916ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4917ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49181a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4919ed502f03SStefano Zampini       }
49209566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4921ed502f03SStefano Zampini     }
4922ed502f03SStefano Zampini   }
49239566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4924ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4925ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4926ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
49273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4928ed502f03SStefano Zampini }
4929c215019aSStefano Zampini 
4930d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4931d71ae5a4SJacob Faibussowitsch {
4932c215019aSStefano Zampini   bool               dmem;
4933c215019aSStefano Zampini   const PetscScalar *av;
4934c215019aSStefano Zampini 
4935c215019aSStefano Zampini   PetscFunctionBegin;
4936c215019aSStefano Zampini   dmem = isCudaMem(v);
49379566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4938c215019aSStefano Zampini   if (n && idx) {
4939c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4940c215019aSStefano Zampini     widx.assign(idx, idx + n);
49419566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4942c215019aSStefano Zampini 
4943c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
4944c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
4945c215019aSStefano Zampini     if (dmem) {
4946c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
4947c215019aSStefano Zampini     } else {
4948c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
4949c215019aSStefano Zampini       dv = w->data();
4950c215019aSStefano Zampini     }
4951c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4952c215019aSStefano Zampini 
4953c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4954c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4955c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
495648a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4957c215019aSStefano Zampini     delete w;
4958c215019aSStefano Zampini   } else {
49599566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4960c215019aSStefano Zampini   }
49619566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
49629566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
49633ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4964c215019aSStefano Zampini }
4965