xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 995bce04e36fa350fc08696b5ebe4b00e1092023)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
19d0967f54SJacob Faibussowitsch #endif
20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
21a2cee5feSJed Brown #include <thrust/remove.h>
22a2cee5feSJed Brown #include <thrust/sort.h>
23a2cee5feSJed Brown #include <thrust/unique.h>
24e8d2b73aSMark Adams 
/* Legal values for the -mat_cusparse_*_storage_format options.  The trailing entries
   (enum type name, option prefix, NULL sentinel) follow the list format expected by
   PetscOptionsEnum(), which is used to parse them in MatSetFromOptions_SeqAIJCUSPARSE(). */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
/* Note the SpMM list is in 0-based value order, so e.g. COO_ALG4 (=5) appears after CSR_ALG1 (=4) */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
619ae82921SPaul Mullowney 
62087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65087f3262SPaul Mullowney 
666fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
676fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
686fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
69087f3262SPaul Mullowney 
706fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
726fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
74dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
75a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
776fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
786fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
849ae82921SPaul Mullowney 
857f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
88470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
89470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
907f756511SDominic Meiser 
9157181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
92a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9357181aedSStefano Zampini 
94c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
95e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
96219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
97c215019aSStefano Zampini 
98d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
99d71ae5a4SJacob Faibussowitsch {
100aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1016e111a19SKarl Rupp 
102ca45077fSPaul Mullowney   PetscFunctionBegin;
103ca45077fSPaul Mullowney   switch (op) {
104d71ae5a4SJacob Faibussowitsch   case MAT_CUSPARSE_MULT:
105d71ae5a4SJacob Faibussowitsch     cusparsestruct->format = format;
106d71ae5a4SJacob Faibussowitsch     break;
107d71ae5a4SJacob Faibussowitsch   case MAT_CUSPARSE_ALL:
108d71ae5a4SJacob Faibussowitsch     cusparsestruct->format = format;
109d71ae5a4SJacob Faibussowitsch     break;
110d71ae5a4SJacob Faibussowitsch   default:
111d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
112ca45077fSPaul Mullowney   }
1133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
114ca45077fSPaul Mullowney }
1159ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats.

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`.
        `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: [](chapter_matrices), `Mat`, `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation attached to this matrix type; a no-op for types without one */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(PETSC_SUCCESS);
}
139e057df02SPaul Mullowney 
140d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
141d71ae5a4SJacob Faibussowitsch {
142365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
143365b711fSMark Adams 
144365b711fSMark Adams   PetscFunctionBegin;
145365b711fSMark Adams   cusparsestruct->use_cpu_solve = use_cpu;
1463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
147365b711fSMark Adams }
148365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Level: intermediate

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

.seealso: [](chapter_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the implementation attached to this matrix type; a no-op for types without one */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(PETSC_SUCCESS);
}
172365b711fSMark Adams 
173d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
174d71ae5a4SJacob Faibussowitsch {
175e6e9a74fSStefano Zampini   PetscFunctionBegin;
1761a2c6b5cSJunchao Zhang   switch (op) {
1771a2c6b5cSJunchao Zhang   case MAT_FORM_EXPLICIT_TRANSPOSE:
1781a2c6b5cSJunchao Zhang     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
1799566063dSJacob Faibussowitsch     if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1801a2c6b5cSJunchao Zhang     A->form_explicit_transpose = flg;
1811a2c6b5cSJunchao Zhang     break;
182d71ae5a4SJacob Faibussowitsch   default:
183d71ae5a4SJacob Faibussowitsch     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
184d71ae5a4SJacob Faibussowitsch     break;
185e6e9a74fSStefano Zampini   }
1863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
187e6e9a74fSStefano Zampini }
188e6e9a74fSStefano Zampini 
189bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
190bddcd29dSMark Adams 
/*
  Numeric LU factorization for MATSEQAIJCUSPARSE.

  The factorization itself runs on the CPU via MatLUFactorNumeric_SeqAIJ(); afterwards
  the appropriate GPU (or CPU) solve routines are installed on B and, unless
  -mat_cusparse_use_cpu_solve was requested, the triangular factors are analyzed and
  copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* ensure the host copy of A is current before the CPU factorization reads it */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  /* the new factors exist only on the host at this point */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* natural ordering: the solves can skip the row/column permutations */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no GPU MatMatSolve() support: fall back to the defaults by clearing these */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
222bddcd29dSMark Adams 
/*
  Processes the -mat_cusparse_* runtime options for a MATSEQAIJCUSPARSE matrix.

  Storage-format and algorithm options apply only to non-factored matrices; the
  CUDA >= 11 block additionally exposes the cuSPARSE SpMV/SpMM/csr2csc algorithm
  choices, cross-checked against the cusparse.h enum values because
  PetscOptionsEnum() assigns values by position in the string lists above.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for MatMult() only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* format for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}
2589ae82921SPaul Mullowney 
/*
  Builds (or refreshes) the unit-diagonal lower-triangular factor of the ILU/LU
  factorization on the GPU from the host factor stored in the Mat_SeqAIJ of A.

  The host factor stores each row's strictly-lower entries with an implicit unit
  diagonal; here the diagonal 1's are made explicit so that cuSPARSE gets a complete
  CSR matrix.  On first call (no loTriFactor yet) the whole CSR structure plus the
  csrsv solve-analysis data is created; subsequent calls refresh the numerical
  values only, reusing the cached structure and the pinned staging buffer AA_h.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  /* only (re)build when the current values live on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* row 0 contributes only its unit diagonal, hence ai[1] rather than ai[0] */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned (page-locked) host buffers used to stage the CSR arrays for upload */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0; /* row 0 holds only the explicit unit diagonal */
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy row i's strictly-lower entries, then append the unit diagonal below */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        /* the assign() calls below upload the staged host arrays (THRUSTINTARRAY32/
           THRUSTARRAY are presumably device containers — defined in cusparsematimpl.h) */
        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the work buffer needed by the csrsv analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for later value-only updates; AiLo/AjLo are no longer needed */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; /* explicit unit diagonal, as in the build path */
          offset += 1;
          v += nz;
        }
        /* sparsity pattern is unchanged: upload only the refreshed values */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3899ae82921SPaul Mullowney 
/* Build (first call) or refresh (subsequent calls) the upper triangular factor U of an
   ILU factorization on the GPU.

   The host factor lives in Mat_SeqAIJ storage where a->diag[i] indexes the diagonal
   entry of row i; rows are walked backwards (i = n-1 .. 0) so U can be laid out with
   the (inverted) diagonal entry first in each row.  On the first call the CSR
   structure, the cuSPARSE matrix descriptor, and the triangular-solve analysis data
   are all created; afterwards only the numerical values are recomputed and re-uploaded.
   Collective work is skipped entirely unless the host copy is newer than the device
   copy (offloadmask UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* number of nonzeros in the upper triangular part, from the diag index bounds */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first call: build structure + values, then run the solve analysis */
        PetscScalar *AAUp;

        /* pinned (page-locked) host buffers for fast host->device transfers below */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate space for the CSR structure of the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows from last to first */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* start of row i's strictly-upper entries in the host factor */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset to the start of row i (nz off-diagonals + 1 diagonal) */
          offset -= (nz + 1);

          /* first, set the diagonal element: stored as its reciprocal, placed first in the row */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          /* then copy the off-diagonal column indices and values verbatim */
          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 (CUDA >= 9) requires the GENERAL matrix type; triangularity is conveyed via fill mode */
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: copy the host CSR arrays into device (thrust) containers */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an explicit work buffer whose size is queried up front */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AAUp is kept (as AA_h) to stage value updates on later calls */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* structure already exists on the GPU: recompute and re-upload values only */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the values of the upper triangular matrix (same layout as the first pass) */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal element (reciprocal), then the off-diagonal values */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
5209ae82921SPaul Mullowney 
/* Push the ILU triangular factors of A onto the GPU and cache the row/column
   permutations of the factorization as device index arrays (only when they are
   not the identity, and only once). */
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis = aij->row, colis = aij->icol;
  PetscBool                     rowid, colid;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* build (or refresh) both triangular factors on the device */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device unless it is the identity */
  PetscCall(ISIdentity(rowis, &rowid));
  if (!rowid && !factors->rpermIndices) {
    const PetscInt *ridx;

    PetscCall(ISGetIndices(rowis, &ridx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(ridx, ridx + nrows);
    PetscCall(ISRestoreIndices(rowis, &ridx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }

  /* likewise for the column permutation */
  PetscCall(ISIdentity(colis, &colid));
  if (!colid && !factors->cpermIndices) {
    const PetscInt *cidx;

    PetscCall(ISGetIndices(colis, &cidx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(cidx, cidx + nrows);
    PetscCall(ISRestoreIndices(colis, &cidx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
5639ae82921SPaul Mullowney 
/* Build (first call) or refresh (subsequent calls) both triangular factors of an
   ICC (incomplete Cholesky) factorization on the GPU.

   Only the upper triangle is stored on the host; the lower factor shares the same
   CSR structure (AiUp/AjUp) and is solved as its transpose (solveOp = TRANSPOSE on
   an UPPER fill-mode descriptor).  The upper factor uses a UNIT diagonal type while
   the lower one is NON_UNIT, matching the value scaling done in the fill loops
   below.  NOTE(review): A->data is viewed through both Mat_SeqAIJ and Mat_SeqSBAIJ
   here — this relies on the i/j/a fields lining up between the two structs; confirm
   against the struct definitions before reordering members. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(PETSC_SUCCESS);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the values of both factors */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) { /* first call: build structure, values and solve analyses */
        /* Allocate space for the CSR structure shared by both factors */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, diagonal entry first in each row */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers to row i of the host factor */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements: reciprocal of the stored diagonal */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* off-diagonals: negated for U, and additionally scaled by 1/diag for L */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the upper triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 (CUDA >= 9) requires the GENERAL matrix type; triangularity is conveyed via fill mode */
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix: copy the host CSR arrays into device (thrust) containers */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an explicit work buffer whose size is queried up front */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the lower triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* stored UPPER but solved with the TRANSPOSE operation below, i.e. L = U^T */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same structure as the upper factor, different values */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else { /* structure already exists on the GPU: recompute and re-upload values only */
        /* Fill the values of both factors (same layout/scaling as the first pass) */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
760087f3262SPaul Mullowney 
/* Push the ICC triangular factors of A onto the GPU and, when the factorization
   permutation is not the identity, cache it (and its inverse) as device index
   arrays for use during MatSolve. */
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            perm    = aij->row;
  PetscBool                     identity;
  PetscInt                      nrows = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  /* symmetric storage: off-diagonal entries count twice, the diagonal once */
  factors->nnz = (aij->nz - nrows) * 2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* device copies of the permutation (rows) and its inverse (columns) */
  PetscCall(ISIdentity(perm, &identity));
  if (!identity) {
    IS              invperm;
    const PetscInt *pidx, *ipidx;

    PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &invperm));
    PetscCall(ISGetIndices(invperm, &ipidx));
    PetscCall(ISGetIndices(perm, &pidx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(pidx, pidx + nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(ipidx, ipidx + nrows);
    PetscCall(ISRestoreIndices(invperm, &ipidx));
    PetscCall(ISDestroy(&invperm));
    PetscCall(ISRestoreIndices(perm, &pidx));
    PetscCall(PetscLogCpuToGpu(2. * nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
797087f3262SPaul Mullowney 
/* Numeric Cholesky factorization for SeqAIJCUSPARSE: perform the factorization on
   the CPU (pulling A back from the GPU first), install the appropriate GPU solve
   callbacks, then copy the triangular factors to the device. */
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *bseq = (Mat_SeqAIJ *)B->data;
  PetscBool   natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the natural-ordering solve variants skip the permutation handling */
  PetscCall(ISIdentity(bseq->row, &natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU and run the solve analysis */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
8269ae82921SPaul Mullowney 
/*
  Builds explicit transposes (CSC form) of the lower and upper triangular factors and runs
  the cusparse triangular-solve analysis on them, so MatSolveTranspose_SeqAIJCUSPARSE can
  later use CUSPARSE_OPERATION_NON_TRANSPOSE solves.  Results are stashed in
  loTriFactorPtrTranspose/upTriFactorPtrTranspose of the Mat_SeqAIJCUSPARSETriFactors
  hanging off A->spptr.  Assumes loTriFactorPtr/upTriFactorPtr are already populated.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the lower triangular factor */
  PetscCall(PetscNew(&loTriFactorT));
  loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the lower triangular factor; the fill mode flips
     because we store the transpose explicitly */
  matrixType = cusparseGetMatType(loTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(loTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));

  /* set the operation: non-transpose, since the transpose is stored explicitly */
  loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the lower triangular factor*/
  loTriFactorT->csrMat                 = new CsrMatrix;
  loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
  loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
  loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
  loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
  loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
  loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);

  /* compute the transpose of the lower triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* CUDA >= 11 requires a user-provided workspace for csr2csc */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
                                                  loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
#else
                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* fixed: this must be PetscLogEventEnd() to close the Begin above; a second Begin corrupts the event log */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                            loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/

  /* allocate space for the transpose of the upper triangular factor */
  PetscCall(PetscNew(&upTriFactorT));
  upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors of the upper triangular factor; fill mode flips as above */
  matrixType = cusparseGetMatType(upTriFactor->descr);
  indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
  fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(upTriFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));

  /* set the operation */
  upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the upper triangular factor*/
  upTriFactorT->csrMat                 = new CsrMatrix;
  upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
  upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
  upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
  upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
  upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
  upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);

  /* compute the transpose of the upper triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
                                                  upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  {
    // there is no clean way to have PetscCallCUSPARSE wrapping this function...
    auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
#else
                                 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
#endif
    PetscCallCUSPARSE(stat);
  }

  PetscCallCUDA(WaitForCUDA());
  /* fixed: this must be PetscLogEventEnd() to close the Begin above; a second Begin corrupts the event log */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  /* TODO(review): the lower- and upper-factor paths above are near-identical and should be factored into a helper */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
990bda325fcSPaul Mullowney 
/* Unary functor converting a PetscScalar to a PetscInt by truncating its real part.
   Used with thrust::transform to recover integer permutation indices from a
   csr2csc pass performed on scalar "identity" values. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s)
  {
    const PetscReal re = PetscRealPart(s);
    return (PetscInt)re;
  }
};
994a49f1ed0SStefano Zampini 
/*
  Forms (or refreshes) the explicit transpose of A on the GPU, stored in
  cusparsestruct->matTranspose.  For CSR format the transpose is built via csr2csc;
  on the first call the column-permutation (csr2csc_i) is computed and cached so that
  subsequent value updates are a single permuted copy.  No-op if A->transupdated.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data; /* host-side CSR data (a->i, a->nz) */
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  /* transpose already up to date: nothing to do */
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* non-CSR (ELL/HYB) transposes cannot be updated in place; rebuild from scratch */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta (device-resident scalar constants used by SpMV) */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* transpose has swapped row/col dimensions but the same number of nonzeros */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* upload the uncompressed row offsets of A; needed as csr2csc input below */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
  #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
  #endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      /* HYB transpose route: HYB -> CSR -> CSC -> HYB, using two temporary CSR matrices */
      CsrMatrix *temp = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose) */
      tempT->num_rows = A->rmap->n;
      tempT->num_cols = A->cmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* One-time: run csr2csc on the "identity" values 0,1,2,... so the transposed values
         encode the CSR->CSC permutation; cache it as integer indices in csr2csc_i */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the permuted scalar "identity" values back to integer permutation indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* fast path: gather A's values through the cached permutation into the transpose */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1187bda325fcSPaul Mullowney 
1188a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1189d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1190d71ae5a4SJacob Faibussowitsch {
1191c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1192465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1193465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1194465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1195465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1196bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1197aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1198aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1199aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1200bda325fcSPaul Mullowney 
1201bda325fcSPaul Mullowney   PetscFunctionBegin;
1202aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1203aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
12049566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1205aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1206aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1207bda325fcSPaul Mullowney   }
1208bda325fcSPaul Mullowney 
1209bda325fcSPaul Mullowney   /* Get the GPU pointers */
12109566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12119566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1212c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1213c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1214bda325fcSPaul Mullowney 
12159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1216aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
12179371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1218aa372e3fSPaul Mullowney 
1219aa372e3fSPaul Mullowney   /* First, solve U */
12209f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12219f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1222aa372e3fSPaul Mullowney 
1223aa372e3fSPaul Mullowney   /* Then, solve L */
12249f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
12259f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1226aa372e3fSPaul Mullowney 
1227aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
12289371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1229aa372e3fSPaul Mullowney 
1230aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1231a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1232bda325fcSPaul Mullowney 
1233bda325fcSPaul Mullowney   /* restore */
12349566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
12359566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
12369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
12383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1239bda325fcSPaul Mullowney }
1240bda325fcSPaul Mullowney 
1241d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1242d71ae5a4SJacob Faibussowitsch {
1243465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1244465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1245bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1246aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1247aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1248aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1249bda325fcSPaul Mullowney 
1250bda325fcSPaul Mullowney   PetscFunctionBegin;
1251aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1252aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
12539566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1254aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1255aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1256bda325fcSPaul Mullowney   }
1257bda325fcSPaul Mullowney 
1258bda325fcSPaul Mullowney   /* Get the GPU pointers */
12599566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12609566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1261bda325fcSPaul Mullowney 
12629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1263aa372e3fSPaul Mullowney   /* First, solve U */
12649f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12659f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1266aa372e3fSPaul Mullowney 
1267aa372e3fSPaul Mullowney   /* Then, solve L */
12689f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
12699f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1270bda325fcSPaul Mullowney 
1271bda325fcSPaul Mullowney   /* restore */
12729566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
12739566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
12749566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
12763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1277bda325fcSPaul Mullowney }
1278bda325fcSPaul Mullowney 
1279d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1280d71ae5a4SJacob Faibussowitsch {
1281465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1282465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1283465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1284465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
12859ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1286aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1287aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1288aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
12899ae82921SPaul Mullowney 
12909ae82921SPaul Mullowney   PetscFunctionBegin;
1291e057df02SPaul Mullowney   /* Get the GPU pointers */
12929566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12939566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1294c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1295c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
12969ae82921SPaul Mullowney 
12979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1298aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
12999371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1300aa372e3fSPaul Mullowney 
1301aa372e3fSPaul Mullowney   /* Next, solve L */
13029f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
13039f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1304aa372e3fSPaul Mullowney 
1305aa372e3fSPaul Mullowney   /* Then, solve U */
13069f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
13079f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1308d49cd2b7SBarry Smith 
13094e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
13109371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
13119ae82921SPaul Mullowney 
13129566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13139566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13149566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
13163ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13179ae82921SPaul Mullowney }
13189ae82921SPaul Mullowney 
1319d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1320d71ae5a4SJacob Faibussowitsch {
1321465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1322465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
13239ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1324aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1325aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1326aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
13279ae82921SPaul Mullowney 
13289ae82921SPaul Mullowney   PetscFunctionBegin;
1329e057df02SPaul Mullowney   /* Get the GPU pointers */
13309566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
13319566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
13329ae82921SPaul Mullowney 
13339566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1334aa372e3fSPaul Mullowney   /* First, solve L */
13359f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
13369f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1337d49cd2b7SBarry Smith 
1338aa372e3fSPaul Mullowney   /* Next, solve U */
13399f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
13409f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
13419ae82921SPaul Mullowney 
13429566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13439566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13449566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13459566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
13463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
13479ae82921SPaul Mullowney }
13489ae82921SPaul Mullowney 
1349da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1350da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
1351d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1352d71ae5a4SJacob Faibussowitsch {
1353da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1354da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1355da112707SJunchao Zhang   const PetscScalar            *barray;
1356da112707SJunchao Zhang   PetscScalar                  *xarray;
1357da112707SJunchao Zhang 
1358da112707SJunchao Zhang   PetscFunctionBegin;
1359da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1360da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1361da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1362da112707SJunchao Zhang 
1363da112707SJunchao Zhang   /* Solve L*y = b */
1364da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1365da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
13669371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
13679371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
136812ba2bc6SJunchao Zhang                                        fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()!
1369da112707SJunchao Zhang 
1370da112707SJunchao Zhang   /* Solve U*x = y */
1371da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
13729371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
13739371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1374da112707SJunchao Zhang 
1375da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1376da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1377da112707SJunchao Zhang 
1378da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1379da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
13803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1381da112707SJunchao Zhang }
1382da112707SJunchao Zhang 
1383d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1384d71ae5a4SJacob Faibussowitsch {
1385da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1386da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1387da112707SJunchao Zhang   const PetscScalar            *barray;
1388da112707SJunchao Zhang   PetscScalar                  *xarray;
1389da112707SJunchao Zhang 
1390da112707SJunchao Zhang   PetscFunctionBegin;
139112ba2bc6SJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1392da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
139335cb6cd3SPierre Jolivet     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
13949371c9d4SSatish Balay                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1395da112707SJunchao Zhang 
1396da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
13979371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1398da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
139912ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
140012ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
140112ba2bc6SJunchao Zhang   }
1402da112707SJunchao Zhang 
140312ba2bc6SJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
14049371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1405da112707SJunchao Zhang 
14069371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
140712ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1408da112707SJunchao Zhang   }
1409da112707SJunchao Zhang 
1410da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1411da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1412da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1413da112707SJunchao Zhang 
1414da112707SJunchao Zhang   /* Solve Ut*y = b */
1415da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1416da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
14179371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
14189371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1419da112707SJunchao Zhang 
1420da112707SJunchao Zhang   /* Solve Lt*x = y */
1421da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
14229371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
14239371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1424da112707SJunchao Zhang 
1425da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1426da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1427da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1428da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
14293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1430da112707SJunchao Zhang }
1431da112707SJunchao Zhang 
14328eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1433d71ae5a4SJacob Faibussowitsch {
1434da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1435da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1436da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1437da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1438da112707SJunchao Zhang   PetscInt                      m, nz;
1439da112707SJunchao Zhang   PetscBool                     flg;
1440da112707SJunchao Zhang 
1441da112707SJunchao Zhang   PetscFunctionBegin;
1442da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1443da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1444da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1445da112707SJunchao Zhang   }
1446da112707SJunchao Zhang 
1447da112707SJunchao Zhang   /* Copy A's value to fact */
1448da112707SJunchao Zhang   m  = fact->rmap->n;
1449da112707SJunchao Zhang   nz = aij->nz;
1450da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1451da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1452da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1453da112707SJunchao Zhang 
1454da112707SJunchao Zhang   /* Factorize fact inplace */
14559371c9d4SSatish Balay   if (m)
14569371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
14579371c9d4SSatish Balay                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1458da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1459da112707SJunchao Zhang     int              numerical_zero;
1460da112707SJunchao Zhang     cusparseStatus_t status;
1461da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1462da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1463da112707SJunchao Zhang   }
1464da112707SJunchao Zhang 
146512ba2bc6SJunchao Zhang   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
146612ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
146712ba2bc6SJunchao Zhang   */
14689371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1469da112707SJunchao Zhang 
14709371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1471da112707SJunchao Zhang 
147212ba2bc6SJunchao Zhang   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
147312ba2bc6SJunchao Zhang   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
147412ba2bc6SJunchao Zhang 
1475da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1476da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1477da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1478da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1479da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1480da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
14813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1482da112707SJunchao Zhang }
1483da112707SJunchao Zhang 
14848eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1485d71ae5a4SJacob Faibussowitsch {
1486da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1487da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1488da112707SJunchao Zhang   PetscInt                      m, nz;
1489da112707SJunchao Zhang 
1490da112707SJunchao Zhang   PetscFunctionBegin;
1491da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1492da112707SJunchao Zhang     PetscInt  i;
1493da112707SJunchao Zhang     PetscBool flg, missing;
1494da112707SJunchao Zhang 
1495da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1496da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1497da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1498da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1499da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1500da112707SJunchao Zhang   }
1501da112707SJunchao Zhang 
1502da112707SJunchao Zhang   /* Free the old stale stuff */
1503da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1504da112707SJunchao Zhang 
1505da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1506da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1507da112707SJunchao Zhang    */
1508da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1509da112707SJunchao Zhang 
1510da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1511da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1512da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1513da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1514da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1515da112707SJunchao Zhang 
1516da112707SJunchao Zhang   aij->row = NULL;
1517da112707SJunchao Zhang   aij->col = NULL;
1518da112707SJunchao Zhang 
1519da112707SJunchao Zhang   /* ====================================================================== */
1520da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1521da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1522da112707SJunchao Zhang   /* ====================================================================== */
1523da112707SJunchao Zhang   const int *Ai, *Aj;
1524da112707SJunchao Zhang 
1525da112707SJunchao Zhang   m  = fact->rmap->n;
1526da112707SJunchao Zhang   nz = aij->nz;
1527da112707SJunchao Zhang 
1528da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1529da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1530da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1531da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1532da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1533da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1534da112707SJunchao Zhang 
1535da112707SJunchao Zhang   /* ====================================================================== */
1536da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1537da112707SJunchao Zhang   /* ====================================================================== */
1538da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1539da112707SJunchao Zhang   cusparseDiagType_t diagType;
1540da112707SJunchao Zhang 
1541da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1542da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1543da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1544da112707SJunchao Zhang 
1545da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1546da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1547da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1548da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1549da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1550da112707SJunchao Zhang   */
1551da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1552da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
15539371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
15549371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
15559371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1556da112707SJunchao Zhang 
1557da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1558da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
15599371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
15609371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
15619371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1562da112707SJunchao Zhang 
1563da112707SJunchao Zhang   /* ========================================================================= */
1564da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1565da112707SJunchao Zhang   /* ========================================================================= */
1566da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
15679371c9d4SSatish Balay   if (m)
15689371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
15699371c9d4SSatish Balay                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1570da112707SJunchao Zhang 
1571da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1572da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1573da112707SJunchao Zhang 
1574da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1575da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1576da112707SJunchao Zhang 
1577da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
15789371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1579da112707SJunchao Zhang 
1580da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
15819371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1582da112707SJunchao Zhang 
1583da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
158412ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
158512ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
158612ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1587da112707SJunchao Zhang    */
158812ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
158912ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
159012ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1591da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
159212ba2bc6SJunchao Zhang   } else {
159312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
159412ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1595da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
159612ba2bc6SJunchao Zhang   }
1597da112707SJunchao Zhang 
1598da112707SJunchao Zhang   /* ========================================================================== */
1599da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1600da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1601da112707SJunchao Zhang   /* ========================================================================== */
1602da112707SJunchao Zhang   int              structural_zero;
1603da112707SJunchao Zhang   cusparseStatus_t status;
1604da112707SJunchao Zhang 
1605da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
16069371c9d4SSatish Balay   if (m)
16079371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
16089371c9d4SSatish Balay                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1609da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1610da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1611da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1612da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1613da112707SJunchao Zhang   }
1614da112707SJunchao Zhang 
1615da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
16160dd8c0acSJunchao Zhang   {
1617da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
16180dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1619da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1620da112707SJunchao Zhang 
1621da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1622da112707SJunchao Zhang     Ai    = Aseq->i;
1623da112707SJunchao Zhang     Adiag = Aseq->diag;
1624da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1625da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1626da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1627da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1628da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1629da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1630da112707SJunchao Zhang         */
1631da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1632da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1633da112707SJunchao Zhang       }
1634da112707SJunchao Zhang     }
1635da112707SJunchao Zhang     fs->numericFactFlops = flops;
16360dd8c0acSJunchao Zhang   }
1637da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
16383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1639da112707SJunchao Zhang }
1640da112707SJunchao Zhang 
/* Triangular solves with the device IC(0) factor: x = L^{-T} (L^{-1} b).
   The factor L (values in spptr->csrVal) and all cuSPARSE descriptors were prepared by
   MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0() / MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a   = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Step 1: solve L y = b, with y landing in the internal work array tri->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_Y, tri->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, /* L Y = X */
                                       tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_L));

  /* Step 2: solve Lt x = y; descriptor X is rebound to the output array first */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, /* Lt X = Y */
                                       tri->dnVecDescr_Y, tri->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n)); /* one multiply-add per nonzero, per triangular solve */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1671da112707SJunchao Zhang 
/* Numeric IC(0) factorization with cuSPARSE csric02, done in place on fact's value array.
   The sparsity pattern, descriptors, and work buffers were set up by
   MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(). Because the factor's numeric values change,
   the SpSV analyses for the L and Lt solves are redone here before installing the solve ops.
   The unnamed MatFactorInfo parameter is unused. */
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's latest values are on the device */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* csric02 errors out on empty matrices (m=0) */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    /* PetscCheck (not PetscAssert) for consistency with the other zero-pivot checks in this file;
       the surrounding PetscDefined(USE_DEBUG) already restricts the check to debug builds */
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* Redo the solve analyses since the factor's values changed */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* the same solve handles the transpose; both solves use spMatDescr_L with (non-)transpose ops */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* estimate computed during the symbolic phase */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1724da112707SJunchao Zhang 
/* Symbolic phase of a device IC(0) (zero-fill incomplete Cholesky) factorization with cuSPARSE.

   IC(0) introduces no fill, so the factor's sparsity pattern equals A's: there is no true
   symbolic factorization to perform. Instead this routine copies A's CSR pattern to fact,
   creates the cuSPARSE matrix/vector descriptors, sizes and allocates the work buffers, and
   runs the csric02 and SpSV analyses. It installs MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0()
   as the numeric phase.

   The unnamed IS (permutation) parameter is unused; callers only take this path for identity
   permutations. Only info->fill is read from info (recorded as fill_ratio_given). */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    /* Sanity checks: A must be seqaijcusparse, square, and have a full diagonal (csric02 requires it) */
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* IC(0) produces no fill by construction */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj; /* device pointers to A's CSR row offsets and column indices */

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* M is the legacy (csric02) descriptor; L is the generic SpSV descriptor.*/
  /* Both alias the same device arrays csrRowPtr/csrColIdx/csrVal.          */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; /* unlike ILU0's L, the Cholesky factor has a non-unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M)); /* csric02 errors out on empty matrices (m=0) */

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M; /* aliased, not separately freed */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M; /* aliased, not separately freed */
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft; /* NOTE: this Ai (host row offsets) intentionally shadows the device Ai above */
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* rough estimate: roughly half of the off-diagonal nonzeros of the row lie left of the diagonal */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops; /* consumed by PetscLogGpuFlops() in the numeric phase */
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1866da112707SJunchao Zhang #endif
1867da112707SJunchao Zhang 
/* Symbolic ILU factorization: dispatch to the cuSPARSE device ILU(0) fast path when it
   applies (cuSPARSE >= 11.5, device factorization enabled, zero fill levels, identity row
   and column orderings), otherwise fall back to the host symbolic factorization. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool rowIdentity = PETSC_FALSE, colIdentity = PETSC_FALSE;

  if (tri->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &rowIdentity));
    PetscCall(ISIdentity(iscol, &colIdentity));
  }
  if (info->levels == 0.0 && rowIdentity && colIdentity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Host path: drop stale device factors, then reuse the SeqAIJ symbolic phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1890da112707SJunchao Zhang 
/* Symbolic LU factorization: always performed on the host via the SeqAIJ implementation;
   only the numeric phase (MatLUFactorNumeric_SeqAIJCUSPARSE) is device-aware. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Discard any stale device triangular-factor data before redoing the symbolic phase */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1901da112707SJunchao Zhang 
/* Symbolic ICC factorization for a SeqAIJCUSPARSE factor matrix B of A.

   When no fill levels are requested, the permutation is the identity, and the
   factorization is bound to the device (and cuSPARSE is recent enough), the
   device ICC(0) path is used. Otherwise the host (SeqAIJ) symbolic phase runs
   and the numeric phase is routed to the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;

  /* Only probe the permutation when factorization is to run on the device */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    /* ICC(0) with natural ordering: handled entirely on the device */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
#endif
  /* General path: host symbolic factorization, device numeric factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1921da112707SJunchao Zhang 
/* Symbolic Cholesky factorization for a SeqAIJCUSPARSE factor matrix B of A.

   Always performed on the host via the SeqAIJ code, after discarding any
   previous device factors; the numeric phase is redirected to CUSPARSE. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Invalidate any previously assembled device triangular factors */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1932da112707SJunchao Zhang 
/* Query callback: reports MATSOLVERCUSPARSE as the solver package backing this
   factored matrix (the Mat argument is unused). */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1939841d4cb1SJunchao Zhang 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
1953841d4cb1SJunchao Zhang 
/*
   MatGetFactor_seqaijcusparse_cusparse - creates the factor matrix *B (of type
   MATSEQAIJCUSPARSE) used for LU/ILU/ILUDT or Cholesky/ICC factorizations of the
   n x n matrix A.

   The runtime option -mat_factor_bind_factorization <host|device> selects where
   the factorization is performed when possible (default "device"); the choice is
   recorded in the triangular-factors structure attached to *B. When A is bound
   to the CPU and bindings propagate, the host (SeqAIJ) symbolic routines are
   installed instead of the CUSPARSE ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Parse -mat_factor_bind_factorization under the factor's options prefix (falling back to A's prefix) */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  /* Only the (case-insensitive) strings "host" and "device" are accepted */
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    /* Install CUSPARSE symbolic routines unless the factor matrix is CPU-bound */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  /* Allow callers to query which solver package backs this factor */
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2006841d4cb1SJunchao Zhang 
/* Copy the matrix values from the GPU back into the host array a->a when the
   GPU holds the only up-to-date copy (offload mask == PETSC_OFFLOAD_GPU).
   Handles both unfactored matrices (values live in the CsrMatrix of the mult
   struct) and, with a sufficiently recent cuSPARSE, factored matrices whose
   values live in the triangular-factors structure. On success the offload mask
   becomes PETSC_OFFLOAD_BOTH. Only values are copied; the host sparsity
   pattern is assumed current. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* Unfactored matrix: device values are in the CSR mult structure */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
20367e8381f9SStefano Zampini 
/* Return a read/write host pointer to the nonzero values, syncing them from the
   GPU first so the host copy is current. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Make sure the host values are up to date before handing them out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
204467a45760SJunchao Zhang 
/* Release a pointer obtained with MatSeqAIJGetArray: since the host values may
   have been modified, the device copy is marked stale. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  /* Host is now the authoritative copy */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
205267a45760SJunchao Zhang 
/* Return a read-only host pointer to the nonzero values, syncing them from the
   GPU first so the host copy is current. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Read access still requires an up-to-date host copy */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
206067a45760SJunchao Zhang 
/* Release a pointer obtained with MatSeqAIJGetArrayRead. Read-only access does
   not invalidate the device copy, so the offload mask is left untouched. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(PETSC_SUCCESS);
}
206767a45760SJunchao Zhang 
/* Return a write-only host pointer to the nonzero values. The current contents
   will be overwritten, so no device-to-host sync is performed. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}
207467a45760SJunchao Zhang 
/* Release a pointer obtained with MatSeqAIJGetArrayWrite: the host values were
   (potentially) rewritten, so the device copy is marked stale. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  /* Host is now the authoritative copy */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
20827e8381f9SStefano Zampini 
/* Expose the device-resident CSR arrays (row offsets, column indices, values)
   of the GPU copy of A, plus the memory type (always PETSC_MEMTYPE_CUDA). Any
   of the output pointers may be NULL if not wanted. Not available for factored
   matrices. With 64-bit PetscInt an error is raised for i/j, since the device
   offsets/indices are stored as 32-bit ints (THRUSTINTARRAY32).

   Fix over previous revision: the two user-facing error messages read
   "does not supported"; corrected to "does not support". */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Make sure the device copy exists and is current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}
21137ee59b9bSJunchao Zhang 
/*
   MatSeqAIJCUSPARSECopyToGPU - ensure the GPU holds an up-to-date copy of the
   host AIJ matrix.

   If the nonzero pattern is unchanged (matching nonzerostate) and the storage
   format is CSR, only the numerical values are re-uploaded. Otherwise the whole
   cuSPARSE structure is rebuilt from the host arrays: a CsrMatrix (plus, for
   CUDA >= 11, a cusparseSpMatDescr), or a HYB/ELL matrix on older CUDA.
   Compressed-row storage (only nonempty rows represented) is honored when in
   use. Errors if the matrix is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* whether host and device end up in sync */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so any cached transpose values are stale (pattern kept: PETSC_FALSE) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Pattern or format changed: destroy and rebuild all device-side structures */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* compressed-row storage: only the nonempty rows are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values: upload the pattern only; device will not mirror the host */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants, required with CUSPARSE_POINTER_MODE_DEVICE below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* pre-CUDA-11 path: build a temporary CSR matrix, convert to HYB/ELL, then free the CSR copy */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the CSR staging copy is no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
22639ae82921SPaul Mullowney 
/* Thrust functor for zipped iterators: given a tuple (src, dst), accumulates
   the first element into the second (dst += src). */
struct VecCUDAPlusEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT t)
  {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2271aa372e3fSPaul Mullowney 
/* Thrust functor for zipped iterators: given a tuple (src, dst), copies the
   first element into the second (dst = src). */
struct VecCUDAEquals {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
22797e8381f9SStefano Zampini 
/* Thrust functor for zipped iterators: the mirror of VecCUDAEquals — copies the
   second tuple element into the first (get<0> = get<1>). */
struct VecCUDAEqualsReverse {
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2287e6e9a74fSStefano Zampini 
/* Per-product scratch data attached to a MatProduct involving a SeqAIJCUSPARSE
   matrix. Freed by MatDestroy_MatMatCusparse. Field order must not change:
   other translation units may depend on this layout. */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably whether the product C is dense — confirm against the product routines */
  PetscScalar   *Bt;       /* device buffer freed with cudaFree; presumably an explicit transpose of B — TODO confirm */
  Mat            X;        /* auxiliary matrix, destroyed with MatDestroy */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers introduced by the CUDA 11.4 API */
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2312ccdfe979SStefano Zampini 
/* Destructor for the MatMatCusparse scratch data attached to a MatProduct:
   releases device buffers, cuSPARSE descriptors (guarded by the CUDA version
   they were created under), the auxiliary matrix X, and the struct itself. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  /* cudaFree(NULL) is a no-op, so Bt needs no guard */
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors are only destroyed when they were actually created */
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
2336ccdfe979SStefano Zampini 
2337ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2338ccdfe979SStefano Zampini 
2339d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2340d71ae5a4SJacob Faibussowitsch {
2341ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2342ccdfe979SStefano Zampini   Mat                           A, B;
2343afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2344ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2345ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2346ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2347ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2348ccdfe979SStefano Zampini   const PetscScalar            *barray;
2349ccdfe979SStefano Zampini   PetscScalar                  *carray;
2350ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2351ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2352ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2353ccdfe979SStefano Zampini 
2354ccdfe979SStefano Zampini   PetscFunctionBegin;
2355ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
235628b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2357ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2358ccdfe979SStefano Zampini   A      = product->A;
2359ccdfe979SStefano Zampini   B      = product->B;
23609566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
236128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2362ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2363ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
236428b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
23659566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2366ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2367ccdfe979SStefano Zampini   switch (product->type) {
2368ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2369ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2370ccdfe979SStefano Zampini     mat = cusp->mat;
2371ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2372ccdfe979SStefano Zampini     m   = A->rmap->n;
2373ccdfe979SStefano Zampini     n   = B->cmap->n;
2374ccdfe979SStefano Zampini     break;
2375ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
23761a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2377e6e9a74fSStefano Zampini       mat = cusp->mat;
2378e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2379e6e9a74fSStefano Zampini     } else {
23809566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2381ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2382ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2383e6e9a74fSStefano Zampini     }
2384ccdfe979SStefano Zampini     m = A->cmap->n;
2385ccdfe979SStefano Zampini     n = B->cmap->n;
2386ccdfe979SStefano Zampini     break;
2387ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2388ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2389ccdfe979SStefano Zampini     mat = cusp->mat;
2390ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2391ccdfe979SStefano Zampini     m   = A->rmap->n;
2392ccdfe979SStefano Zampini     n   = B->rmap->n;
2393ccdfe979SStefano Zampini     break;
2394d71ae5a4SJacob Faibussowitsch   default:
2395d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2396ccdfe979SStefano Zampini   }
239728b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2398ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2399ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
24009566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
24019566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
2402cd3f9d89SJunchao Zhang   PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr));
2403afb2bd1cSJunchao Zhang 
24049566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2405c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2406cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr));
24079566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2408c8378d12SStefano Zampini   } else {
2409cd3f9d89SJunchao Zhang     PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr));
24109566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2411c8378d12SStefano Zampini   }
2412c8378d12SStefano Zampini 
24139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2414afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2415afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2416a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2417afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2418fcdce8c4SStefano Zampini     size_t mmBufferSize;
24199371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
24209371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
24219371c9d4SSatish Balay       mmdata->matBDescr = NULL;
24229371c9d4SSatish Balay     }
2423afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
24249566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2425afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2426afb2bd1cSJunchao Zhang     }
2427c8378d12SStefano Zampini 
24289371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
24299371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
24309371c9d4SSatish Balay       mmdata->matCDescr = NULL;
24319371c9d4SSatish Balay     }
2432afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
24339566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2434afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2435afb2bd1cSJunchao Zhang     }
2436afb2bd1cSJunchao Zhang 
2437afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
24389371c9d4SSatish Balay       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
24399371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
24409371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2441afb2bd1cSJunchao Zhang     }
24429371c9d4SSatish Balay     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
24439371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2444fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
24459566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
24469566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2447fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2448fcdce8c4SStefano Zampini     }
2449afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2450afb2bd1cSJunchao Zhang   } else {
2451afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
24529566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
24539566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
24549566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2455afb2bd1cSJunchao Zhang   }
2456afb2bd1cSJunchao Zhang 
2457afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
24589371c9d4SSatish Balay   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
24599371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2460afb2bd1cSJunchao Zhang #else
2461afb2bd1cSJunchao Zhang   PetscInt k;
2462afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2463ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2464ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2465ccdfe979SStefano Zampini     cublasStatus_t cerr;
2466ccdfe979SStefano Zampini 
24679566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
24689371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
24699371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2470ccdfe979SStefano Zampini     blda = B->cmap->n;
2471afb2bd1cSJunchao Zhang     k = B->cmap->n;
2472afb2bd1cSJunchao Zhang   } else {
2473afb2bd1cSJunchao Zhang     k = B->rmap->n;
2474ccdfe979SStefano Zampini   }
2475ccdfe979SStefano Zampini 
2476afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
24779371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
24789371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2479afb2bd1cSJunchao Zhang #endif
24809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
24819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
2482cd3f9d89SJunchao Zhang   PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray));
2483ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2484cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
24859566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2486ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2487cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray));
24889566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2489ccdfe979SStefano Zampini   } else {
2490cd3f9d89SJunchao Zhang     PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray));
2491ccdfe979SStefano Zampini   }
249248a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
249348a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
24943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2495ccdfe979SStefano Zampini }
2496ccdfe979SStefano Zampini 
2497d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2498d71ae5a4SJacob Faibussowitsch {
2499ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2500ccdfe979SStefano Zampini   Mat                 A, B;
2501ccdfe979SStefano Zampini   PetscInt            m, n;
2502ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2503ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2504ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2505ccdfe979SStefano Zampini 
2506ccdfe979SStefano Zampini   PetscFunctionBegin;
2507ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
250828b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2509ccdfe979SStefano Zampini   A = product->A;
2510ccdfe979SStefano Zampini   B = product->B;
25119566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
251228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2513ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
251408401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2515ccdfe979SStefano Zampini   switch (product->type) {
2516ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2517ccdfe979SStefano Zampini     m = A->rmap->n;
2518ccdfe979SStefano Zampini     n = B->cmap->n;
2519ccdfe979SStefano Zampini     break;
2520ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2521ccdfe979SStefano Zampini     m = A->cmap->n;
2522ccdfe979SStefano Zampini     n = B->cmap->n;
2523ccdfe979SStefano Zampini     break;
2524ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2525ccdfe979SStefano Zampini     m = A->rmap->n;
2526ccdfe979SStefano Zampini     n = B->rmap->n;
2527ccdfe979SStefano Zampini     break;
2528ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2529ccdfe979SStefano Zampini     m = B->cmap->n;
2530ccdfe979SStefano Zampini     n = B->cmap->n;
2531ccdfe979SStefano Zampini     break;
2532ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2533ccdfe979SStefano Zampini     m = B->rmap->n;
2534ccdfe979SStefano Zampini     n = B->rmap->n;
2535ccdfe979SStefano Zampini     break;
2536d71ae5a4SJacob Faibussowitsch   default:
2537d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2538ccdfe979SStefano Zampini   }
25399566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2540ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
25419566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
25429566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2543ccdfe979SStefano Zampini 
2544ccdfe979SStefano Zampini   /* product data */
25459566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2546ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2547afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2548afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
254948a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2550afb2bd1cSJunchao Zhang #endif
2551ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2552ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
25539566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
25549566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2555ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
25569566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2557ccdfe979SStefano Zampini     } else {
25589566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2559ccdfe979SStefano Zampini     }
2560ccdfe979SStefano Zampini   }
2561ccdfe979SStefano Zampini   C->product->data    = mmdata;
2562ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2563ccdfe979SStefano Zampini 
2564ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
25653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2566ccdfe979SStefano Zampini }
2567ccdfe979SStefano Zampini 
2568d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2569d71ae5a4SJacob Faibussowitsch {
2570ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2571fcdce8c4SStefano Zampini   Mat                           A, B;
2572fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2573fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2574fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2575fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2576fcdce8c4SStefano Zampini   PetscBool                     flg;
2577fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2578fcdce8c4SStefano Zampini   MatProductType                ptype;
2579fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2580fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2581fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2582fcdce8c4SStefano Zampini #endif
2583b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2584ccdfe979SStefano Zampini 
2585ccdfe979SStefano Zampini   PetscFunctionBegin;
2586ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
258728b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
25889566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
258928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2590fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2591fcdce8c4SStefano Zampini   A      = product->A;
2592fcdce8c4SStefano Zampini   B      = product->B;
2593fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2594fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2595fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
259608401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2597fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
259828b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2599fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
260028b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2601fcdce8c4SStefano Zampini     goto finalize;
2602fcdce8c4SStefano Zampini   }
2603fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
26049566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
260528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
26069566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
260728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
260828b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
260928b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2610fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2611fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2612fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
261308401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
261408401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
261508401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
26169566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
26179566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2618fcdce8c4SStefano Zampini 
2619fcdce8c4SStefano Zampini   ptype = product->type;
2620b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2621fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
262228b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2623fa046f9fSJunchao Zhang   }
2624b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2625fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
262628b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2627fa046f9fSJunchao Zhang   }
2628fcdce8c4SStefano Zampini   switch (ptype) {
2629fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2630fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2631fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2632fcdce8c4SStefano Zampini     break;
2633fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2634fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2635fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2636fcdce8c4SStefano Zampini     break;
2637fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2638fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2639fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2640fcdce8c4SStefano Zampini     break;
2641d71ae5a4SJacob Faibussowitsch   default:
2642d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2643fcdce8c4SStefano Zampini   }
2644fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
264528b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
264628b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
264728b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2648fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2649fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2650fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
265128b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
265228b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
265328b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
26549566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2655fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2656fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
26579566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2658b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
26599371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
26609371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2661b4285af6SJunchao Zhang   #else
26629371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
26639371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
26649371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
26659371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2666b4285af6SJunchao Zhang   #endif
2667fcdce8c4SStefano Zampini #else
26689371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
26699371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
26709371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2671fcdce8c4SStefano Zampini #endif
26729566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
26739566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
26749566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2675fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2676fcdce8c4SStefano Zampini finalize:
2677fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
26789566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
26799566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
26809566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2681fcdce8c4SStefano Zampini   c->reallocs = 0;
2682fcdce8c4SStefano Zampini   C->info.mallocs += 0;
2683fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2684fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2685fcdce8c4SStefano Zampini   C->num_ass++;
26863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2687ccdfe979SStefano Zampini }
2688fcdce8c4SStefano Zampini 
2689d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2690d71ae5a4SJacob Faibussowitsch {
2691fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2692fcdce8c4SStefano Zampini   Mat                           A, B;
2693fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2694fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2695fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2696fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2697fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
2698fcdce8c4SStefano Zampini   PetscBool                     flg;
2699fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2700fcdce8c4SStefano Zampini   MatProductType                ptype;
2701fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2702fcdce8c4SStefano Zampini   PetscLogDouble                flops;
2703fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
2704fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2705fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2706fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2707fcdce8c4SStefano Zampini #else
2708fcdce8c4SStefano Zampini   int cnz;
2709fcdce8c4SStefano Zampini #endif
2710b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2711fcdce8c4SStefano Zampini 
2712fcdce8c4SStefano Zampini   PetscFunctionBegin;
2713fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
271428b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2715fcdce8c4SStefano Zampini   A = product->A;
2716fcdce8c4SStefano Zampini   B = product->B;
27179566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
271828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
27199566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
272028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2721fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
2722fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
2723fcdce8c4SStefano Zampini   /* product data */
27249566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2725fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2726fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2727fcdce8c4SStefano Zampini 
27289566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
27299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2730d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2731d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
273208401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
273308401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2734d60bce21SJunchao Zhang 
2735fcdce8c4SStefano Zampini   ptype = product->type;
2736b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2737fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2738fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2739fa046f9fSJunchao Zhang   }
2740b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2741fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2742fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2743fa046f9fSJunchao Zhang   }
2744fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2745fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2746fcdce8c4SStefano Zampini   switch (ptype) {
2747fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2748fcdce8c4SStefano Zampini     m    = A->rmap->n;
2749fcdce8c4SStefano Zampini     n    = B->cmap->n;
2750fcdce8c4SStefano Zampini     k    = A->cmap->n;
2751fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2752fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2753fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2754fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2755fcdce8c4SStefano Zampini     break;
2756fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2757fcdce8c4SStefano Zampini     m = A->cmap->n;
2758fcdce8c4SStefano Zampini     n = B->cmap->n;
2759fcdce8c4SStefano Zampini     k = A->rmap->n;
27609566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2761fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2762fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2763fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2764fcdce8c4SStefano Zampini     break;
2765fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2766fcdce8c4SStefano Zampini     m = A->rmap->n;
2767fcdce8c4SStefano Zampini     n = B->rmap->n;
2768fcdce8c4SStefano Zampini     k = A->cmap->n;
27699566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2770fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2771fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2772fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2773fcdce8c4SStefano Zampini     break;
2774d71ae5a4SJacob Faibussowitsch   default:
2775d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2776fcdce8c4SStefano Zampini   }
2777fcdce8c4SStefano Zampini 
2778fcdce8c4SStefano Zampini   /* create cusparse matrix */
27799566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
27809566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2781fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
2782fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2783fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2784fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2785fcdce8c4SStefano Zampini 
2786fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2787fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2788fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
27899566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
27909566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2791fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2792fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2793fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2794fcdce8c4SStefano Zampini   } else {
2795fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2796fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2797fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2798fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2799fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2800fcdce8c4SStefano Zampini   }
2801fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2802fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
2803fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
2804fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2805fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2806fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
28079566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
28089566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
28099566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
28109566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
28119566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
28129566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
28139566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2816fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2817fcdce8c4SStefano Zampini     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2818fcdce8c4SStefano Zampini     c->nz                = 0;
2819fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2820fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
2821fcdce8c4SStefano Zampini     goto finalizesym;
2822fcdce8c4SStefano Zampini   }
2823fcdce8c4SStefano Zampini 
282428b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
282528b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2826fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2827fcdce8c4SStefano Zampini   if (!biscompressed) {
2828fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
2829fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2830fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2831fcdce8c4SStefano Zampini #endif
2832fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2833fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2834fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
2835fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2836fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2837fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2838fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2839fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2840fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2841fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2842fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
28439566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2844fcdce8c4SStefano Zampini     }
2845fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2846fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
2847fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2848fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
28499371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28509371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2851fcdce8c4SStefano Zampini     }
2852fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2853fcdce8c4SStefano Zampini #endif
2854fcdce8c4SStefano Zampini   }
285528b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
285628b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2857fcdce8c4SStefano Zampini   /* precompute flops count */
2858fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2859fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2860fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2861fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
2862fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
2863fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2864fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2865fcdce8c4SStefano Zampini       }
2866fcdce8c4SStefano Zampini     }
2867fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2868fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2869fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
2870fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2871fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
2872fcdce8c4SStefano Zampini     }
2873fcdce8c4SStefano Zampini   } else { /* TODO */
2874fcdce8c4SStefano Zampini     flops = 0.;
2875fcdce8c4SStefano Zampini   }
2876fcdce8c4SStefano Zampini 
2877fcdce8c4SStefano Zampini   mmdata->flops = flops;
28789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2879b4285af6SJunchao Zhang 
2880fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28819566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
28829371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28839371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
28849566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2885b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2886b4285af6SJunchao Zhang   {
2887b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2888b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2889b4285af6SJunchao Zhang   */
2890b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
2891b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
2892b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
2893b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2894b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
2895b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
2896b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
2897b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
2898b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
2899b4285af6SJunchao Zhang 
2900b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
29019371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
29029371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29039566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2904b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
29059371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
29069371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2907b4285af6SJunchao Zhang 
29089371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
29099371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29109566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
29119566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
29129566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
29139371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
29149371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
29169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
2917b4285af6SJunchao Zhang 
2918b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
29199566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2920b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
2921b4285af6SJunchao Zhang     /* allocate matrix C */
29229371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29239371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29249371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
29259371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2926b4285af6SJunchao Zhang     /* update matC with the new pointers */
29279371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29289371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2929b4285af6SJunchao Zhang 
29309371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
29319371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
29339371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
29349371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29359566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
29369371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29379371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29389566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2939b4285af6SJunchao Zhang   }
2940ae37ee31SJunchao Zhang   #else
2941b4285af6SJunchao Zhang   size_t bufSize2;
2942fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
29439371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
29449371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29459566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2946fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
29479371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
29489371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2949fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
29509371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
29519371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2952fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2953fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2954fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2955fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2956fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
29579566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2958fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
29599371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29609371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2961fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
29629566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2963fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
29649371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
29659371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
2966fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29679566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2968fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
29699566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29709371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29719371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29729371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29739371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2974ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2975fcdce8c4SStefano Zampini #else
29769566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
29779371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29789371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
29799371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2980fcdce8c4SStefano Zampini   c->nz = cnz;
2981fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29829566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2983fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
29849566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2985fcdce8c4SStefano Zampini 
29869566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2987fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2988fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2989fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
29909371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29919371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
29929371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2993fcdce8c4SStefano Zampini #endif
29949566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
29959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2996fcdce8c4SStefano Zampini finalizesym:
2997fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2998fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2999fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
30009566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
30019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3002fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3003fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3004fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3005fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3006fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3007fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3008fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30099566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30109566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3011fcdce8c4SStefano Zampini   } else {
3012fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3013fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3016fcdce8c4SStefano Zampini   }
3017fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3018fcdce8c4SStefano Zampini     PetscInt r = 0;
3019fcdce8c4SStefano Zampini     c->i[0]    = 0;
3020fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3021fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3022fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3023fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3024fcdce8c4SStefano Zampini     }
3025fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3026fcdce8c4SStefano Zampini   }
30279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
30289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
30299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3030fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3031fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3032fcdce8c4SStefano Zampini   c->rmax          = 0;
3033fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3034fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3035fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3036fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3037fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3038fcdce8c4SStefano Zampini   }
30399566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
30409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3041fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3042fcdce8c4SStefano Zampini 
3043fcdce8c4SStefano Zampini   C->nonzerostate++;
30449566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
30459566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3046fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3047fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3048fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3049fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3050fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3051abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3052fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3053fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3054fcdce8c4SStefano Zampini   }
3055fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
30563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3057fcdce8c4SStefano Zampini }
3058fcdce8c4SStefano Zampini 
3059fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3060fcdce8c4SStefano Zampini 
3061fcdce8c4SStefano Zampini /* Dispatch for products where B is either dense (MATSEQDENSE) or sparse (MATSEQAIJCUSPARSE); also honors the *_backend_cpu options to fall back to the CPU implementation */
3062d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3063d71ae5a4SJacob Faibussowitsch {
3064fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3065fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3066fcdce8c4SStefano Zampini 
3067fcdce8c4SStefano Zampini   PetscFunctionBegin;
3068fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
30699566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
307048a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3071fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3072fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
307348a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3074fcdce8c4SStefano Zampini   }
307565e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
307665e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
307765e4b4d4SStefano Zampini     switch (product->type) {
307865e4b4d4SStefano Zampini     case MATPRODUCT_AB:
307965e4b4d4SStefano Zampini       if (product->api_user) {
3080d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
30819566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3082d0609cedSBarry Smith         PetscOptionsEnd();
308365e4b4d4SStefano Zampini       } else {
3084d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
30859566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3086d0609cedSBarry Smith         PetscOptionsEnd();
308765e4b4d4SStefano Zampini       }
308865e4b4d4SStefano Zampini       break;
308965e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
309065e4b4d4SStefano Zampini       if (product->api_user) {
3091d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
30929566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3093d0609cedSBarry Smith         PetscOptionsEnd();
309465e4b4d4SStefano Zampini       } else {
3095d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
30969566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3097d0609cedSBarry Smith         PetscOptionsEnd();
309865e4b4d4SStefano Zampini       }
309965e4b4d4SStefano Zampini       break;
310065e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
310165e4b4d4SStefano Zampini       if (product->api_user) {
3102d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
31039566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3104d0609cedSBarry Smith         PetscOptionsEnd();
310565e4b4d4SStefano Zampini       } else {
3106d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
31079566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3108d0609cedSBarry Smith         PetscOptionsEnd();
310965e4b4d4SStefano Zampini       }
311065e4b4d4SStefano Zampini       break;
311165e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
311265e4b4d4SStefano Zampini       if (product->api_user) {
3113d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
31149566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3115d0609cedSBarry Smith         PetscOptionsEnd();
311665e4b4d4SStefano Zampini       } else {
3117d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
31189566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3119d0609cedSBarry Smith         PetscOptionsEnd();
312065e4b4d4SStefano Zampini       }
312165e4b4d4SStefano Zampini       break;
312265e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
312365e4b4d4SStefano Zampini       if (product->api_user) {
3124d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
31259566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3126d0609cedSBarry Smith         PetscOptionsEnd();
312765e4b4d4SStefano Zampini       } else {
3128d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
31299566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3130d0609cedSBarry Smith         PetscOptionsEnd();
313165e4b4d4SStefano Zampini       }
313265e4b4d4SStefano Zampini       break;
3133d71ae5a4SJacob Faibussowitsch     default:
3134d71ae5a4SJacob Faibussowitsch       break;
313565e4b4d4SStefano Zampini     }
313665e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
313765e4b4d4SStefano Zampini   }
313865e4b4d4SStefano Zampini   /* dispatch */
3139fcdce8c4SStefano Zampini   if (isdense) {
3140ccdfe979SStefano Zampini     switch (product->type) {
3141ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3142ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3143ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3144ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3145ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3146fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
31479566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3148fcdce8c4SStefano Zampini       } else {
3149fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3150fcdce8c4SStefano Zampini       }
3151fcdce8c4SStefano Zampini       break;
3152d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3153d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3154d71ae5a4SJacob Faibussowitsch       break;
3155d71ae5a4SJacob Faibussowitsch     default:
3156d71ae5a4SJacob Faibussowitsch       break;
3157ccdfe979SStefano Zampini     }
3158fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3159fcdce8c4SStefano Zampini     switch (product->type) {
3160fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3161fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3162d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3163d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3164d71ae5a4SJacob Faibussowitsch       break;
3165fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3166fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3167d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3168d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3169d71ae5a4SJacob Faibussowitsch       break;
3170d71ae5a4SJacob Faibussowitsch     default:
3171d71ae5a4SJacob Faibussowitsch       break;
3172fcdce8c4SStefano Zampini     }
3173fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
31749566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3175fcdce8c4SStefano Zampini   }
31763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3177ccdfe979SStefano Zampini }
3178ccdfe979SStefano Zampini 
3179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3180d71ae5a4SJacob Faibussowitsch {
31819ae82921SPaul Mullowney   PetscFunctionBegin;
31829566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
31833ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3184e6e9a74fSStefano Zampini }
3185e6e9a74fSStefano Zampini 
3186d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3187d71ae5a4SJacob Faibussowitsch {
3188e6e9a74fSStefano Zampini   PetscFunctionBegin;
31899566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
31903ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3191e6e9a74fSStefano Zampini }
3192e6e9a74fSStefano Zampini 
3193d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3194d71ae5a4SJacob Faibussowitsch {
3195e6e9a74fSStefano Zampini   PetscFunctionBegin;
31969566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
31973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3198e6e9a74fSStefano Zampini }
3199e6e9a74fSStefano Zampini 
3200d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3201d71ae5a4SJacob Faibussowitsch {
3202e6e9a74fSStefano Zampini   PetscFunctionBegin;
32039566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
32043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
32059ae82921SPaul Mullowney }
32069ae82921SPaul Mullowney 
3207d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3208d71ae5a4SJacob Faibussowitsch {
3209ca45077fSPaul Mullowney   PetscFunctionBegin;
32109566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
32113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3212ca45077fSPaul Mullowney }
3213ca45077fSPaul Mullowney 
3214d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3215d71ae5a4SJacob Faibussowitsch {
3216a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3217a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3218a0e72f99SJunchao Zhang }
3219a0e72f99SJunchao Zhang 
3220afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3221d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3222d71ae5a4SJacob Faibussowitsch {
32239ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3224aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
32259ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3226e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3227e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3228e6e9a74fSStefano Zampini   PetscBool                     compressed;
3229afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3230afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3231afb2bd1cSJunchao Zhang #endif
32326e111a19SKarl Rupp 
32339ae82921SPaul Mullowney   PetscFunctionBegin;
323408401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3235cbc6b225SStefano Zampini   if (!a->nz) {
3236*995bce04SJacob Faibussowitsch     if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz));
3237*995bce04SJacob Faibussowitsch     else PetscCall(VecSeq_CUDA::Set(zz, 0));
32383ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
3239e6e9a74fSStefano Zampini   }
324034d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
32419566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3242e6e9a74fSStefano Zampini   if (!trans) {
32439ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
32445f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3245e6e9a74fSStefano Zampini   } else {
32461a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3247e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3248e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3249e6e9a74fSStefano Zampini     } else {
32509566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3251e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3252e6e9a74fSStefano Zampini     }
3253e6e9a74fSStefano Zampini   }
3254e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3255e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3256213423ffSJunchao Zhang 
3257e6e9a74fSStefano Zampini   try {
32589566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
325969d47153SPierre Jolivet     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
32609566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3261afb2bd1cSJunchao Zhang 
32629566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3263e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3264afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3265afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3266afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3267afb2bd1cSJunchao Zhang       */
3268e6e9a74fSStefano Zampini       xptr = xarray;
3269afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3270213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3271afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3272afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3273afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3274afb2bd1cSJunchao Zhang        */
3275afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3276afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3277afb2bd1cSJunchao Zhang         nx             = mat->num_cols;
3278afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3279afb2bd1cSJunchao Zhang       }
3280afb2bd1cSJunchao Zhang #endif
3281e6e9a74fSStefano Zampini     } else {
3282afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3283afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3284afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3285afb2bd1cSJunchao Zhang        */
3286afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3287e6e9a74fSStefano Zampini       dptr = zarray;
3288e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3289afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3290e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3291d0967f54SJacob Faibussowitsch 
3292d0967f54SJacob Faibussowitsch         thrust::for_each(
3293d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3294d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3295d0967f54SJacob Faibussowitsch #endif
3296d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
32979371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3298e6e9a74fSStefano Zampini       }
3299afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3300afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3301afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3302afb2bd1cSJunchao Zhang         nx             = mat->num_rows;
3303afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3304afb2bd1cSJunchao Zhang       }
3305afb2bd1cSJunchao Zhang #endif
3306e6e9a74fSStefano Zampini     }
33079ae82921SPaul Mullowney 
3308afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3309aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3310afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
33115f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3312afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
33139566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
33149566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
33159371c9d4SSatish Balay         PetscCallCUSPARSE(
33169371c9d4SSatish Balay           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
33179566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3318afb2bd1cSJunchao Zhang 
3319afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3320afb2bd1cSJunchao Zhang       } else {
3321afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
33229566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
33239566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3324afb2bd1cSJunchao Zhang       }
3325afb2bd1cSJunchao Zhang 
33269371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
33279371c9d4SSatish Balay                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3328afb2bd1cSJunchao Zhang #else
33297656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
33309371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3331afb2bd1cSJunchao Zhang #endif
3332aa372e3fSPaul Mullowney     } else {
3333213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3334afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3335afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3336afb2bd1cSJunchao Zhang #else
3337301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
33389371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3339afb2bd1cSJunchao Zhang #endif
3340a65300a6SPaul Mullowney       }
3341aa372e3fSPaul Mullowney     }
33429566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3343aa372e3fSPaul Mullowney 
3344e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3345213423ffSJunchao Zhang       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3346213423ffSJunchao Zhang         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
3347*995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::Copy(yy, zz));      /* zz = yy */
3348e6e9a74fSStefano Zampini         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
3349*995bce04SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
33507656d835SStefano Zampini         }
3351213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
3352*995bce04SJacob Faibussowitsch         PetscCall(VecSeq_CUDA::Set(zz, 0));
33537656d835SStefano Zampini       }
33547656d835SStefano Zampini 
3355213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3356213423ffSJunchao Zhang       if (compressed) {
33579566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3358da81f932SPierre Jolivet         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
3359a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3360a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3361a0e72f99SJunchao Zhang          */
3362a0e72f99SJunchao Zhang #if 0
3363a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3364a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3365a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3366e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3367c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3368a0e72f99SJunchao Zhang #else
3369a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3370a0e72f99SJunchao Zhang         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3371a0e72f99SJunchao Zhang #endif
33729566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3373e6e9a74fSStefano Zampini       }
3374e6e9a74fSStefano Zampini     } else {
3375*995bce04SJacob Faibussowitsch       if (yy && yy != zz) PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */
3376e6e9a74fSStefano Zampini     }
33779566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
33789566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
33799566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3380d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3381d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3382d71ae5a4SJacob Faibussowitsch   }
3383e6e9a74fSStefano Zampini   if (yy) {
33849566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3385e6e9a74fSStefano Zampini   } else {
33869566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3387e6e9a74fSStefano Zampini   }
33883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
33899ae82921SPaul Mullowney }
33909ae82921SPaul Mullowney 
3391d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3392d71ae5a4SJacob Faibussowitsch {
3393ca45077fSPaul Mullowney   PetscFunctionBegin;
33949566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
33953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3396ca45077fSPaul Mullowney }
3397ca45077fSPaul Mullowney 
3398d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3399d71ae5a4SJacob Faibussowitsch {
3400042217e8SBarry Smith   PetscObjectState    onnz = A->nonzerostate;
3401042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
34023fa6b06aSMark Adams 
3403042217e8SBarry Smith   PetscFunctionBegin;
34049566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3405042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
34069566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
34079566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3408042217e8SBarry Smith     cusp->deviceMat = NULL;
3409042217e8SBarry Smith   }
34103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
34119ae82921SPaul Mullowney }
34129ae82921SPaul Mullowney 
3413e057df02SPaul Mullowney /*@
341411a5261eSBarry Smith    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3415e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
341611a5261eSBarry Smith    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3417e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3418e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3419e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
34209ae82921SPaul Mullowney 
3421d083f849SBarry Smith    Collective
34229ae82921SPaul Mullowney 
34239ae82921SPaul Mullowney    Input Parameters:
342411a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
34259ae82921SPaul Mullowney .  m - number of rows
34269ae82921SPaul Mullowney .  n - number of columns
34279ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
34289ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
34292ef1f0ffSBarry Smith          (possibly different for each row) or `NULL`
34309ae82921SPaul Mullowney 
34319ae82921SPaul Mullowney    Output Parameter:
34329ae82921SPaul Mullowney .  A - the matrix
34339ae82921SPaul Mullowney 
34342ef1f0ffSBarry Smith    Level: intermediate
34352ef1f0ffSBarry Smith 
34362ef1f0ffSBarry Smith    Notes:
343711a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
34389ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
343911a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
34409ae82921SPaul Mullowney 
34412ef1f0ffSBarry Smith    If `nnz` is given then `nz` is ignored
34429ae82921SPaul Mullowney 
344311a5261eSBarry Smith    The AIJ format, also called
34442ef1f0ffSBarry Smith    compressed row storage, is fully compatible with standard Fortran
34459ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
34469ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
34479ae82921SPaul Mullowney 
34489ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
34492ef1f0ffSBarry Smith    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
34509ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
34519ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
34529ae82921SPaul Mullowney 
34539ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
34549ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
34559ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
34569ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
34579ae82921SPaul Mullowney 
34582ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
34599ae82921SPaul Mullowney @*/
3460d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3461d71ae5a4SJacob Faibussowitsch {
34629ae82921SPaul Mullowney   PetscFunctionBegin;
34639566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
34649566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
34659566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
34669566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
34673ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
34689ae82921SPaul Mullowney }
34699ae82921SPaul Mullowney 
3470d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3471d71ae5a4SJacob Faibussowitsch {
34729ae82921SPaul Mullowney   PetscFunctionBegin;
34739ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
34749566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
34759ae82921SPaul Mullowney   } else {
34769566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3477aa372e3fSPaul Mullowney   }
34789566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
34799566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
34809566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
34819566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
34829566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
34839566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
34849566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
34859566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
34869566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
34879566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
34889566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
34893ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
34909ae82921SPaul Mullowney }
34919ae82921SPaul Mullowney 
3492ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
349395639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3494d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3495d71ae5a4SJacob Faibussowitsch {
34969ff858a8SKarl Rupp   PetscFunctionBegin;
34979566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
34989566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
34993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
35009ff858a8SKarl Rupp }
35019ff858a8SKarl Rupp 
/* Y = Y + a*X for two MATSEQAIJCUSPARSE matrices.

   Picks the cheapest device path available based on the nonzero-pattern relation:
   - SAME_NONZERO_PATTERN:   one cublasXaxpy over the two (conformal) value arrays
   - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam (sparse matrix-matrix addition) in place over Y
   - anything else (or X not using this implementation): host MatAXPY_SeqAIJ fallback
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* X does not use this GPU implementation: compute on the host.  Y's values
       change there, so its cached transpose must be invalidated */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    /* cheap device-side comparison of the two CSR index structures; if they match,
       upgrade str so the fast cublas path below is taken */
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    /* Y = a*X + 1.0*Y via cuSPARSE csrgeam, reusing Y's arrays for the output */
    PetscScalar b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b passed below live in host memory */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 uses the csrgeam2 interface, which needs an explicitly sized user workspace */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    /* restore the default (device) pointer mode on the shared handle */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical sparsity: the value arrays are conformal, so a dense BLAS axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* patterns genuinely differ: let the host implementation rebuild Y's structure */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
358795639643SRichard Tran Mills 
/* Scale every stored value of Y by a (Y = a*Y) with a single cuBLAS scal over the
   device value array; the cached diagonal is invalidated afterwards. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t handle;
  PetscScalar   *vals;
  PetscBLASInt   inc = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  /* get (and mark as modified) the device value array */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, inc));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  /* the cached diagonal no longer matches the scaled values */
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
360733c9ba73SStefano Zampini 
/* Set all stored entries of A to zero.  For non-factored matrices the device-side
   CSR values (and those of a cached transpose, if present) are zeroed with thrust;
   the host array is always zeroed, and the offload mask is updated accordingly. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij   = (Mat_SeqAIJ *)A->data;
  PetscBool   ongpu = PETSC_FALSE; /* true once the device copy has been zeroed too */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;
      if (csr->values) {
        ongpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (cusp->matTranspose) {
      /* keep the cached transpose consistent instead of invalidating it */
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* the host copy is always zeroed */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = ongpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}
36343fa6b06aSMark Adams 
/* Route A's operations to the CPU (flg == PETSC_TRUE) or GPU (flg == PETSC_FALSE)
   implementations, (un)registering the CUSPARSE-specific composed methods to match.
   Factored matrices only record the flag and are otherwise left untouched. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(PETSC_SUCCESS);
  }
  if (flg) {
    /* make sure the host copy is current before operations start running there */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* zero the SeqAIJ sub-ops so MatSeqAIJGetArray() et al. fall back to the host arrays */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* device-aware accessors for the SeqAIJ value arrays */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are only used by the CPU kernels */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3698a587d139SMark 
/* Convert a MATSEQAIJ matrix to MATSEQAIJCUSPARSE.  With MAT_INITIAL_MATRIX a new
   matrix is duplicated from A; with MAT_REUSE_MATRIX A's values are copied into
   *newmat; otherwise (MAT_INPLACE_MATRIX) the matrix is retyped in place. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* CUSPARSE matrices pair with CUDA vectors by default */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: create the cuSPARSE context used by MatMult() and friends */
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: allocate the triangular-factors context instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU operations and the composed CUSPARSE-specific methods */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}
37589ae82921SPaul Mullowney 
/* Registered constructor for MATSEQAIJCUSPARSE: create a plain SeqAIJ matrix and
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
376602fe1965SBarry Smith 
37673ca39a21SBarry Smith /*MC
3768e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3769e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
   CSR, ELL, or Hybrid format.
377211a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3773e057df02SPaul Mullowney 
3774e057df02SPaul Mullowney    Options Database Keys:
377511a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
37762ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`).
37772ef1f0ffSBarry Smith                                       Other options include ell (ellpack) or hyb (hybrid).
37782ef1f0ffSBarry Smith .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`). Other options include ell (ellpack) or hyb (hybrid).
37792ef1f0ffSBarry Smith -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3780e057df02SPaul Mullowney 
3781e057df02SPaul Mullowney   Level: beginner
3782e057df02SPaul Mullowney 
37832ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3784e057df02SPaul Mullowney M*/
37857f756511SDominic Meiser 
3786bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
37870f39cd5aSBarry Smith 
/* Register the cuSPARSE-based factorizations with PETSc's MatSolverType registry. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  /* the banded LU solver is registered for the plain (host) MATSEQAIJ type */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  /* LU, Cholesky, ILU and ICC all share one factory routine for MATSEQAIJCUSPARSE */
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(PETSC_SUCCESS);
}
379929b38603SBarry Smith 
/* Release the device-side COO assembly scaffolding (permutation vectors and, for the
   extended COO path, the jmap/perm device arrays) so it can be rebuilt by a later
   MatSetPreallocationCOO().  A NULL spptr (nothing ever built) is a no-op. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d again
       whenever they are non-NULL, so stale values here would cause a double free */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}
3817cbc6b225SStefano Zampini 
/* Free the whole Mat_SeqAIJCUSPARSE context: both mult structs (matrix and cached
   transpose), work/permutation storage, COO scaffolding, the cuSPARSE handle, and
   finally the struct itself. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
38367f756511SDominic Meiser 
/* Free a CsrMatrix (row offsets, column indices, values) and NULL the caller's pointer. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *m = *mat;

  PetscFunctionBegin;
  if (m) {
    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
38497f756511SDominic Meiser 
/* Free one triangular-factor struct: matrix descriptor, csrsv solve info, the CSR
   storage of the factor, and the associated device/host work buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
38667f756511SDominic Meiser 
/* Free a mult struct, dispatching on the storage format: CSR data is a CsrMatrix,
   while ELL/HYB data is a cusparseHybMat_t (only available before CUDA 11).  Also
   releases the device-resident scalar constants and, on CUDA >= 11, the generic-API
   SpMV descriptors and buffers. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device copies of the 1/0 constants used with CUSPARSE_POINTER_MODE_DEVICE */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cached SpMV setup per operation flavor (see the cuSpMV array definition) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
39087f756511SDominic Meiser 
/* Release everything owned by the triangular-factors context but keep the context
   (and its cusparse handle) alive, so a later factorization can rebuild it. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* lower/upper factors and their transposes (legacy csrsv path) */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    /* band-factorization storage */
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* generic-API (SpSV/csrilu02/csric02) path storage and descriptors */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
3957ccdfe979SStefano Zampini 
3958d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3959d71ae5a4SJacob Faibussowitsch {
3960ccdfe979SStefano Zampini   PetscFunctionBegin;
3961ccdfe979SStefano Zampini   if (*trifactors) {
39629566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
3963f0173cd6SStefano Zampini     PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
39649566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactors));
39657f756511SDominic Meiser   }
39663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
39677f756511SDominic Meiser }
39687e8381f9SStefano Zampini 
39699371c9d4SSatish Balay struct IJCompare {
3970d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3971d71ae5a4SJacob Faibussowitsch   {
39727e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
39737e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
39747e8381f9SStefano Zampini     return false;
39757e8381f9SStefano Zampini   }
39767e8381f9SStefano Zampini };
39777e8381f9SStefano Zampini 
39789371c9d4SSatish Balay struct IJEqual {
3979d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3980d71ae5a4SJacob Faibussowitsch   {
39817e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
39827e8381f9SStefano Zampini     return true;
39837e8381f9SStefano Zampini   }
39847e8381f9SStefano Zampini };
39857e8381f9SStefano Zampini 
39869371c9d4SSatish Balay struct IJDiff {
39879371c9d4SSatish Balay   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
39887e8381f9SStefano Zampini };
39897e8381f9SStefano Zampini 
39909371c9d4SSatish Balay struct IJSum {
39919371c9d4SSatish Balay   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
39927e8381f9SStefano Zampini };
39937e8381f9SStefano Zampini 
39947e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
3995219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
3996d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
3997d71ae5a4SJacob Faibussowitsch {
39987e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
3999fcdce8c4SStefano Zampini   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
4000bfcc3627SStefano Zampini   THRUSTARRAY                          *cooPerm_v = NULL;
400108391a17SStefano Zampini   thrust::device_ptr<const PetscScalar> d_v;
40027e8381f9SStefano Zampini   CsrMatrix                            *matrix;
40037e8381f9SStefano Zampini   PetscInt                              n;
40047e8381f9SStefano Zampini 
40057e8381f9SStefano Zampini   PetscFunctionBegin;
400628b400f6SJacob Faibussowitsch   PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
400728b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
40087e8381f9SStefano Zampini   if (!cusp->cooPerm) {
40099566063dSJacob Faibussowitsch     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
40109566063dSJacob Faibussowitsch     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
40113ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
40127e8381f9SStefano Zampini   }
40137e8381f9SStefano Zampini   matrix = (CsrMatrix *)cusp->mat->mat;
401428b400f6SJacob Faibussowitsch   PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4015e61fc153SStefano Zampini   if (!v) {
4016e61fc153SStefano Zampini     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4017e61fc153SStefano Zampini     goto finalize;
40187e8381f9SStefano Zampini   }
4019e61fc153SStefano Zampini   n = cusp->cooPerm->size();
402008391a17SStefano Zampini   if (isCudaMem(v)) {
402108391a17SStefano Zampini     d_v = thrust::device_pointer_cast(v);
402208391a17SStefano Zampini   } else {
4023e61fc153SStefano Zampini     cooPerm_v = new THRUSTARRAY(n);
4024e61fc153SStefano Zampini     cooPerm_v->assign(v, v + n);
402508391a17SStefano Zampini     d_v = cooPerm_v->data();
40269566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
402708391a17SStefano Zampini   }
40289566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4029e61fc153SStefano Zampini   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4030ddea5d60SJunchao Zhang     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add these them */
4031bfcc3627SStefano Zampini       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
403208391a17SStefano Zampini       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4033ddea5d60SJunchao Zhang       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4034ddea5d60SJunchao Zhang         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
4035ddea5d60SJunchao Zhang         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4036ddea5d60SJunchao Zhang       */
4037e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4038e61fc153SStefano Zampini       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
4039e61fc153SStefano Zampini       delete cooPerm_w;
40407e8381f9SStefano Zampini     } else {
4041ddea5d60SJunchao Zhang       /* all nonzeros in d_v[] are unique entries */
40429371c9d4SSatish Balay       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
40439371c9d4SSatish Balay       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4044ddea5d60SJunchao Zhang       thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
40457e8381f9SStefano Zampini     }
40467e8381f9SStefano Zampini   } else {
4047e61fc153SStefano Zampini     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
404808391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4049e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
40507e8381f9SStefano Zampini     } else {
40519371c9d4SSatish Balay       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
40529371c9d4SSatish Balay       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
40537e8381f9SStefano Zampini       thrust::for_each(zibit, zieit, VecCUDAEquals());
40547e8381f9SStefano Zampini     }
40557e8381f9SStefano Zampini   }
40569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4057e61fc153SStefano Zampini finalize:
4058e61fc153SStefano Zampini   delete cooPerm_v;
40597e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
40609566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4061fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
40629566063dSJacob Faibussowitsch   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
40639566063dSJacob Faibussowitsch   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
40649566063dSJacob Faibussowitsch   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
4065fcdce8c4SStefano Zampini   a->reallocs = 0;
4066fcdce8c4SStefano Zampini   A->info.mallocs += 0;
4067fcdce8c4SStefano Zampini   A->info.nz_unneeded = 0;
4068fcdce8c4SStefano Zampini   A->assembled = A->was_assembled = PETSC_TRUE;
4069fcdce8c4SStefano Zampini   A->num_ass++;
40703ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
40717e8381f9SStefano Zampini }
40727e8381f9SStefano Zampini 
4073d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4074d71ae5a4SJacob Faibussowitsch {
4075a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4076a49f1ed0SStefano Zampini 
4077a49f1ed0SStefano Zampini   PetscFunctionBegin;
4078a49f1ed0SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
40793ba16761SJacob Faibussowitsch   if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
4080a49f1ed0SStefano Zampini   if (destroy) {
40819566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4082a49f1ed0SStefano Zampini     delete cusp->csr2csc_i;
4083a49f1ed0SStefano Zampini     cusp->csr2csc_i = NULL;
4084a49f1ed0SStefano Zampini   }
40851a2c6b5cSJunchao Zhang   A->transupdated = PETSC_FALSE;
40863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4087a49f1ed0SStefano Zampini }
4088a49f1ed0SStefano Zampini 
40897e8381f9SStefano Zampini #include <thrust/binary_search.h>
4090219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4091d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4092d71ae5a4SJacob Faibussowitsch {
40937e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
40947e8381f9SStefano Zampini   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
40957e8381f9SStefano Zampini   PetscInt            cooPerm_n, nzr = 0;
40967e8381f9SStefano Zampini 
40977e8381f9SStefano Zampini   PetscFunctionBegin;
40989566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(A->rmap));
40999566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(A->cmap));
41007e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
41017e8381f9SStefano Zampini   if (n != cooPerm_n) {
41027e8381f9SStefano Zampini     delete cusp->cooPerm;
41037e8381f9SStefano Zampini     delete cusp->cooPerm_a;
41047e8381f9SStefano Zampini     cusp->cooPerm   = NULL;
41057e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
41067e8381f9SStefano Zampini   }
41077e8381f9SStefano Zampini   if (n) {
4108e8729f6fSJunchao Zhang     thrust::device_ptr<PetscInt> d_i, d_j;
4109e8729f6fSJunchao Zhang     PetscInt                    *d_raw_i, *d_raw_j;
4110e8729f6fSJunchao Zhang     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4111e8729f6fSJunchao Zhang     PetscMemType                 imtype, jmtype;
4112e8729f6fSJunchao Zhang 
4113e8729f6fSJunchao Zhang     PetscCall(PetscGetMemType(coo_i, &imtype));
4114e8729f6fSJunchao Zhang     if (PetscMemTypeHost(imtype)) {
4115e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4116e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4117e8729f6fSJunchao Zhang       d_i        = thrust::device_pointer_cast(d_raw_i);
4118e8729f6fSJunchao Zhang       free_raw_i = PETSC_TRUE;
4119e8729f6fSJunchao Zhang       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4120e8729f6fSJunchao Zhang     } else {
4121e8729f6fSJunchao Zhang       d_i = thrust::device_pointer_cast(coo_i);
4122e8729f6fSJunchao Zhang     }
4123e8729f6fSJunchao Zhang 
4124e8729f6fSJunchao Zhang     PetscCall(PetscGetMemType(coo_j, &jmtype));
4125e8729f6fSJunchao Zhang     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4126e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4127e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4128e8729f6fSJunchao Zhang       d_j        = thrust::device_pointer_cast(d_raw_j);
4129e8729f6fSJunchao Zhang       free_raw_j = PETSC_TRUE;
4130e8729f6fSJunchao Zhang       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4131e8729f6fSJunchao Zhang     } else {
4132e8729f6fSJunchao Zhang       d_j = thrust::device_pointer_cast(coo_j);
4133e8729f6fSJunchao Zhang     }
4134e8729f6fSJunchao Zhang 
41357e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
41367e8381f9SStefano Zampini 
4137ad540459SPierre Jolivet     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4138ad540459SPierre Jolivet     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
41397e8381f9SStefano Zampini 
4140ddea5d60SJunchao Zhang     /* Ex.
4141ddea5d60SJunchao Zhang       n = 6
4142ddea5d60SJunchao Zhang       coo_i = [3,3,1,4,1,4]
4143ddea5d60SJunchao Zhang       coo_j = [3,2,2,5,2,6]
4144ddea5d60SJunchao Zhang     */
4145e8729f6fSJunchao Zhang     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4146e8729f6fSJunchao Zhang     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
41477e8381f9SStefano Zampini 
41489566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
41497e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4150ddea5d60SJunchao Zhang     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4151e8729f6fSJunchao Zhang     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4152e8729f6fSJunchao Zhang     THRUSTINTARRAY w(d_j, d_j + n);
41537e8381f9SStefano Zampini 
4154ddea5d60SJunchao Zhang     /*
4155ddea5d60SJunchao Zhang       d_i     = [1,1,3,3,4,4]
4156ddea5d60SJunchao Zhang       d_j     = [2,2,2,3,5,6]
4157ddea5d60SJunchao Zhang       cooPerm = [2,4,1,0,3,5]
4158ddea5d60SJunchao Zhang     */
4159ddea5d60SJunchao Zhang     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4160ddea5d60SJunchao Zhang 
4161ddea5d60SJunchao Zhang     /*
4162ddea5d60SJunchao Zhang       d_i     = [1,3,3,4,4,x]
4163ddea5d60SJunchao Zhang                             ^ekey
4164ddea5d60SJunchao Zhang       d_j     = [2,2,3,5,6,x]
4165ddea5d60SJunchao Zhang                            ^nekye
4166ddea5d60SJunchao Zhang     */
41677e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
41687e8381f9SStefano Zampini       delete cusp->cooPerm_a;
41697e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
4170ddea5d60SJunchao Zhang     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4171ddea5d60SJunchao Zhang       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4172ddea5d60SJunchao Zhang       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4173ddea5d60SJunchao Zhang       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4174ddea5d60SJunchao Zhang       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
41757e8381f9SStefano Zampini       w[0]                  = 0;
4176ddea5d60SJunchao Zhang       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4177ddea5d60SJunchao Zhang       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
41787e8381f9SStefano Zampini     }
41797e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
4180e8729f6fSJunchao Zhang     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4181ddea5d60SJunchao Zhang                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4182ddea5d60SJunchao Zhang                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
41839566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
41847e8381f9SStefano Zampini 
41859566063dSJacob Faibussowitsch     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
41867e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
41877e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
41887e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
41899566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4190ddea5d60SJunchao Zhang     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
41919566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
41927e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
4193fcdce8c4SStefano Zampini     a->rmax          = 0;
41949566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(a->nz, &a->a));
41959566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(a->nz, &a->j));
4196e8729f6fSJunchao Zhang     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
41979566063dSJacob Faibussowitsch     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
41989566063dSJacob Faibussowitsch     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
41997e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
42007e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i + 1] - a->i[i];
42017e8381f9SStefano Zampini       nzr += (PetscInt) !!(nnzr);
42027e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
4203fcdce8c4SStefano Zampini       a->rmax                 = PetscMax(a->rmax, nnzr);
42047e8381f9SStefano Zampini     }
4205fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
42067e8381f9SStefano Zampini     A->preallocated  = PETSC_TRUE;
42079566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
42089566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4209e8729f6fSJunchao Zhang     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4210e8729f6fSJunchao Zhang     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
42117e8381f9SStefano Zampini   } else {
42129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
42137e8381f9SStefano Zampini   }
42149566063dSJacob Faibussowitsch   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
42157e8381f9SStefano Zampini 
42167e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
4217e61fc153SStefano Zampini      The code is so convoluted now that I prefer to copy zeros */
42189566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->nz));
42199566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
42207e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
42219566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
42229566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
42233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42247e8381f9SStefano Zampini }
4225ed502f03SStefano Zampini 
4226d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4227d71ae5a4SJacob Faibussowitsch {
4228219fbbafSJunchao Zhang   Mat_SeqAIJ         *seq;
4229219fbbafSJunchao Zhang   Mat_SeqAIJCUSPARSE *dev;
4230cbc6b225SStefano Zampini   PetscBool           coo_basic = PETSC_TRUE;
4231219fbbafSJunchao Zhang   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
4232219fbbafSJunchao Zhang 
4233219fbbafSJunchao Zhang   PetscFunctionBegin;
42349566063dSJacob Faibussowitsch   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
42359566063dSJacob Faibussowitsch   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4236219fbbafSJunchao Zhang   if (coo_i) {
42379566063dSJacob Faibussowitsch     PetscCall(PetscGetMemType(coo_i, &mtype));
4238219fbbafSJunchao Zhang     if (PetscMemTypeHost(mtype)) {
4239219fbbafSJunchao Zhang       for (PetscCount k = 0; k < coo_n; k++) {
42409371c9d4SSatish Balay         if (coo_i[k] < 0 || coo_j[k] < 0) {
42419371c9d4SSatish Balay           coo_basic = PETSC_FALSE;
42429371c9d4SSatish Balay           break;
42439371c9d4SSatish Balay         }
4244219fbbafSJunchao Zhang       }
4245219fbbafSJunchao Zhang     }
4246219fbbafSJunchao Zhang   }
4247219fbbafSJunchao Zhang 
4248219fbbafSJunchao Zhang   if (coo_basic) { /* i,j are on device or do not contain negative indices */
42499566063dSJacob Faibussowitsch     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
4250219fbbafSJunchao Zhang   } else {
42519566063dSJacob Faibussowitsch     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
4252cbc6b225SStefano Zampini     mat->offloadmask = PETSC_OFFLOAD_CPU;
42539566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4254219fbbafSJunchao Zhang     seq = static_cast<Mat_SeqAIJ *>(mat->data);
4255219fbbafSJunchao Zhang     dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
42569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
42579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
42589566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
42599566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4260219fbbafSJunchao Zhang     dev->use_extended_coo = PETSC_TRUE;
4261219fbbafSJunchao Zhang   }
42623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4263219fbbafSJunchao Zhang }
4264219fbbafSJunchao Zhang 
4265d71ae5a4SJacob Faibussowitsch __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4266d71ae5a4SJacob Faibussowitsch {
4267219fbbafSJunchao Zhang   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4268219fbbafSJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
4269b6c38306SJunchao Zhang   for (; i < nnz; i += grid_size) {
4270b6c38306SJunchao Zhang     PetscScalar sum = 0.0;
4271b6c38306SJunchao Zhang     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4272b6c38306SJunchao Zhang     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4273b6c38306SJunchao Zhang   }
4274219fbbafSJunchao Zhang }
4275219fbbafSJunchao Zhang 
4276d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4277d71ae5a4SJacob Faibussowitsch {
4278219fbbafSJunchao Zhang   Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
4279219fbbafSJunchao Zhang   Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4280219fbbafSJunchao Zhang   PetscCount          Annz = seq->nz;
4281219fbbafSJunchao Zhang   PetscMemType        memtype;
4282219fbbafSJunchao Zhang   const PetscScalar  *v1 = v;
4283219fbbafSJunchao Zhang   PetscScalar        *Aa;
4284219fbbafSJunchao Zhang 
4285219fbbafSJunchao Zhang   PetscFunctionBegin;
4286219fbbafSJunchao Zhang   if (dev->use_extended_coo) {
42879566063dSJacob Faibussowitsch     PetscCall(PetscGetMemType(v, &memtype));
4288219fbbafSJunchao Zhang     if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
42899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
42909566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4291219fbbafSJunchao Zhang     }
4292219fbbafSJunchao Zhang 
42939566063dSJacob Faibussowitsch     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
42949566063dSJacob Faibussowitsch     else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4295219fbbafSJunchao Zhang 
4296cbc6b225SStefano Zampini     if (Annz) {
4297b6c38306SJunchao Zhang       MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
42989566063dSJacob Faibussowitsch       PetscCallCUDA(cudaPeekAtLastError());
4299cbc6b225SStefano Zampini     }
4300219fbbafSJunchao Zhang 
43019566063dSJacob Faibussowitsch     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
43029566063dSJacob Faibussowitsch     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4303219fbbafSJunchao Zhang 
43049566063dSJacob Faibussowitsch     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4305219fbbafSJunchao Zhang   } else {
43069566063dSJacob Faibussowitsch     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
4307219fbbafSJunchao Zhang   }
43083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4309219fbbafSJunchao Zhang }
4310219fbbafSJunchao Zhang 
43115b7e41feSStefano Zampini /*@C
43122ef1f0ffSBarry Smith     MatSeqAIJCUSPARSEGetIJ - returns the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.
43135b7e41feSStefano Zampini 
43142ef1f0ffSBarry Smith    Not Collective
43155b7e41feSStefano Zampini 
43165b7e41feSStefano Zampini     Input Parameters:
43175b7e41feSStefano Zampini +   A - the matrix
431811a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
43195b7e41feSStefano Zampini 
43205b7e41feSStefano Zampini     Output Parameters:
43215b7e41feSStefano Zampini +   ia - the CSR row pointers
43225b7e41feSStefano Zampini -   ja - the CSR column indices
43235b7e41feSStefano Zampini 
43245b7e41feSStefano Zampini     Level: developer
43255b7e41feSStefano Zampini 
432611a5261eSBarry Smith     Note:
43275b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
43285b7e41feSStefano Zampini 
43292ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
43305b7e41feSStefano Zampini @*/
4331d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4332d71ae5a4SJacob Faibussowitsch {
43335f101d05SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
43345f101d05SStefano Zampini   CsrMatrix          *csr;
43355f101d05SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;
43365f101d05SStefano Zampini 
43375f101d05SStefano Zampini   PetscFunctionBegin;
43385f101d05SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
43393ba16761SJacob Faibussowitsch   if (!i || !j) PetscFunctionReturn(PETSC_SUCCESS);
43405f101d05SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4341aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
43429566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
434328b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
43445f101d05SStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
43455f101d05SStefano Zampini   if (i) {
43465f101d05SStefano Zampini     if (!compressed && a->compressedrow.use) { /* need full row offset */
43475f101d05SStefano Zampini       if (!cusp->rowoffsets_gpu) {
43485f101d05SStefano Zampini         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
43495f101d05SStefano Zampini         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
43509566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
43515f101d05SStefano Zampini       }
43525f101d05SStefano Zampini       *i = cusp->rowoffsets_gpu->data().get();
43535f101d05SStefano Zampini     } else *i = csr->row_offsets->data().get();
43545f101d05SStefano Zampini   }
43555f101d05SStefano Zampini   if (j) *j = csr->column_indices->data().get();
43563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43575f101d05SStefano Zampini }
43585f101d05SStefano Zampini 
43595b7e41feSStefano Zampini /*@C
43602ef1f0ffSBarry Smith     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
43615b7e41feSStefano Zampini 
43622ef1f0ffSBarry Smith    Not Collective
43635b7e41feSStefano Zampini 
43645b7e41feSStefano Zampini     Input Parameters:
43655b7e41feSStefano Zampini +   A - the matrix
43662ef1f0ffSBarry Smith .   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
43672ef1f0ffSBarry Smith .   ia - the CSR row pointers
43685b7e41feSStefano Zampini -   ja - the CSR column indices
43695b7e41feSStefano Zampini 
43705b7e41feSStefano Zampini     Level: developer
43715b7e41feSStefano Zampini 
43722ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
43735b7e41feSStefano Zampini @*/
43748eb1d50fSPierre Jolivet PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
4375d71ae5a4SJacob Faibussowitsch {
43765f101d05SStefano Zampini   PetscFunctionBegin;
43775f101d05SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
43785f101d05SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
43795f101d05SStefano Zampini   if (i) *i = NULL;
43805f101d05SStefano Zampini   if (j) *j = NULL;
43813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43825f101d05SStefano Zampini }
43835f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous device values array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* make sure the freshest matrix data lives on the device before handing out the pointer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}
4419ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data

   Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no object state bump or cache invalidation is needed, just drop the pointer */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4442ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   May trigger host-device copies if up-to-date matrix data is on host

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous device values array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the latest host data to the device before exposing it for modification */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the caller may write through the pointer, so the device copy becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* any cached transpose no longer matches the values */
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data

   Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified through the pointer: the cached diagonal is stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so consumers notice the matrix changed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4504039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Note:
   Does not trigger host-device copies and flags data validity on the GPU

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a single contiguous device values array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no MatSeqAIJCUSPARSECopyToGPU() here, the caller overwrites everything */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* device copy becomes authoritative once the caller writes */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* any cached transpose no longer matches the values */
  PetscFunctionReturn(PETSC_SUCCESS);
}
4541ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

   Not Collective

   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data

   Level: developer

.seealso: [](chapter_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: the cached diagonal is stale */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  /* bump the object state so consumers notice the matrix changed */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
4566ed502f03SStefano Zampini 
/* Strict weak ordering of (row, col, value, perm) tuples by (row, col) only;
   the value and permutation slots do not participate in the comparison. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
4575ed502f03SStefano Zampini 
/* Unary functor adding a fixed integer offset to each element (used to shift column indices / row offsets). */
struct Shift {
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4582ed502f03SStefano Zampini 
4583ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4584d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4585d71ae5a4SJacob Faibussowitsch {
4586ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4587ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4588ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4589ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4590ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4591ed502f03SStefano Zampini   cusparseStatus_t              stat;
4592ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4593ed502f03SStefano Zampini 
4594ed502f03SStefano Zampini   PetscFunctionBegin;
4595ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4596ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4597ed502f03SStefano Zampini   PetscValidPointer(C, 4);
4598ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4599ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
46005f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
460108401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4602aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4603aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4604ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4605ed502f03SStefano Zampini     m = A->rmap->n;
4606ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
46079566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
46089566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
46099566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4610ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4611ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4612ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4613ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4614ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4615ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4616ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4617ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4618ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4619ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4620ed502f03SStefano Zampini     Ccusp->nrows            = m;
4621ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4622ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4623ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4624ed502f03SStefano Zampini     Ccsr->num_cols          = n;
46259566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
46269566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
46279566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
46289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
46299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
46309566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
46319566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46339566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46349566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46359566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
463628b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
463728b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4638ed502f03SStefano Zampini 
4639ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4640ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4641ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4642ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4643ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4644ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4645ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4646ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4647ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
4648ed502f03SStefano Zampini     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4649ed502f03SStefano Zampini     if (c->nz) {
46502ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
46512ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
46522ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
46532ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
46542ed87e7eSStefano Zampini 
4655ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4656ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4657ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4658ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
46599566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4660ed502f03SStefano Zampini         }
46612ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
46622ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4663ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4664ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4665ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4666ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
46679566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4668ed502f03SStefano Zampini         }
46692ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
46702ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
46719566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
46729371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46739371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46749371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46759371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46762ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
46772ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
46782ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
46798909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4680ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4681ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
46828909a122SStefano Zampini #else
46838909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
46848909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
46858909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
46868909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
46878909a122SStefano Zampini #endif
46882ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
46892ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
46902ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
46912ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
46922ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
46932ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4694ed502f03SStefano Zampini       auto p1    = Ccusp->cooPerm->begin();
4695ed502f03SStefano Zampini       auto p2    = Ccusp->cooPerm->begin();
4696ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4697792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
46988909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
46998909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
47008909a122SStefano Zampini #endif
47012ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
47022ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
47032ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4704792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
47052ed87e7eSStefano Zampini #else
47062ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4707792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4708792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
47092ed87e7eSStefano Zampini #endif
47109371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47119371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47129566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
47132ed87e7eSStefano Zampini       delete wPerm;
47142ed87e7eSStefano Zampini       delete Acoo;
47152ed87e7eSStefano Zampini       delete Bcoo;
47162ed87e7eSStefano Zampini       delete Ccoo;
4717ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47189371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47199371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4720ed502f03SStefano Zampini #endif
47211a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
47229566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
47239566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4724ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4725ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4726ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4727ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4728ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4729ed502f03SStefano Zampini 
47301a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
47311a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4732a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4733ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4734ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4735ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4736ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4737ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4738ed502f03SStefano Zampini 
4739ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4740ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4741ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4742ed502f03SStefano Zampini 
47439566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4744ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4745ed502f03SStefano Zampini         if (AT) {
4746ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4747ed502f03SStefano Zampini           thrust::advance(rT, -1);
4748ed502f03SStefano Zampini         }
4749ed502f03SStefano Zampini         if (BT) {
4750ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4751ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4752ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4753ed502f03SStefano Zampini         }
4754ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4755ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4756ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4757ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4758ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4759ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
47609566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4761ed502f03SStefano Zampini 
47629566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
47639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
47649566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
47659566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
47669566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
47679566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
47689566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47699566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47709566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4771ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47729371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47739371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4774ed502f03SStefano Zampini #endif
4775ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4776ed502f03SStefano Zampini       }
4777ed502f03SStefano Zampini     }
4778ed502f03SStefano Zampini 
4779ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4780ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4781ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
47829566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
47839566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
4784ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4785ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4786ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4787ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4788ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
47899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47909566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4791ed502f03SStefano Zampini     } else {
47929566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47939566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4794ed502f03SStefano Zampini     }
47959566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
47969566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
47979566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4798ed502f03SStefano Zampini     c->maxnz         = c->nz;
4799ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4800ed502f03SStefano Zampini     c->rmax          = 0;
4801ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4802ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4803ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4804ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4805ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4806ed502f03SStefano Zampini     }
48079566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
48089566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4809ed502f03SStefano Zampini     (*C)->nonzerostate++;
48109566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
48119566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4812ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4813ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4814ed502f03SStefano Zampini   } else {
481508401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4816ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4817ed502f03SStefano Zampini     if (c->nz) {
4818ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
48195f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4820aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
482108401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
48229566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
48239566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
48245f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
48255f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4826ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4827ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4828ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4829aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4830aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4831aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4832aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
48335f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4834ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4835ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
48369566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48379371c9d4SSatish Balay       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
48389371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4839ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
48409371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
48419371c9d4SSatish Balay       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4842ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
48439566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
48441a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
48455f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4846ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4847ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4848ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4849ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4850ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4851ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4852ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48531a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4854ed502f03SStefano Zampini       }
48559566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4856ed502f03SStefano Zampini     }
4857ed502f03SStefano Zampini   }
48589566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4859ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4860ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4861ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
48623ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4863ed502f03SStefano Zampini }
4864c215019aSStefano Zampini 
/* Gather selected entries of the matrix value array into v[].

   If idx is provided, v[k] = a[idx[k]] for k = 0..n-1, where a[] is the aij value
   array kept on the GPU; otherwise the first n values are copied contiguously.
   v may point to either host or device memory; the kind of memory is detected
   with isCudaMem() and the copies/logging are adjusted accordingly.  The matrix
   values are accessed read-only (offload state is not changed). */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem; /* true when v is device (CUDA) memory */
  const PetscScalar *av;   /* device pointer to A's value array */

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* Upload the index set so the gather can run on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* Gather destination: v itself when it is device memory, otherwise a
       temporary device buffer that is copied back to the host afterwards */
    THRUSTARRAY                    *w = NULL;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = dav[widx[k]] via a permutation-iterator gather on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* No index set: contiguous copy of the first n values */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* When v is host memory the result was moved device->host above, so log it
     as a GPU-to-CPU transfer (the original code logged the wrong direction) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}
4900