xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 8eb1d50fa6bb58b38517c609409ddf07aed47a76)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
599acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
69ae82921SPaul Mullowney 
73d13b8fdSMatthew G. Knepley #include <petscconf.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
9087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
103d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
11af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
129ae82921SPaul Mullowney #undef VecType
133d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
14a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
15d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
16d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
17d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
18a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
19d0967f54SJacob Faibussowitsch #endif
20a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
21a2cee5feSJed Brown #include <thrust/remove.h>
22a2cee5feSJed Brown #include <thrust/sort.h>
23a2cee5feSJed Brown #include <thrust/unique.h>
24e8d2b73aSMark Adams 
25e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
26afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
27afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
28afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
29afb2bd1cSJunchao Zhang 
30afb2bd1cSJunchao Zhang   typedef enum {
31afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
33afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
35afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
36afb2bd1cSJunchao Zhang 
37afb2bd1cSJunchao Zhang   typedef enum {
38afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
39afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
40afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
41afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
42afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
50afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
51afb2bd1cSJunchao Zhang 
52afb2bd1cSJunchao Zhang   typedef enum {
5335cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
5435cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
55afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
56afb2bd1cSJunchao Zhang   */
57afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
59afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
60afb2bd1cSJunchao Zhang #endif
619ae82921SPaul Mullowney 
62087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
63087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
65087f3262SPaul Mullowney 
666fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
676fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
686fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
69087f3262SPaul Mullowney 
706fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
726fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
74dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
75a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
776fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
786fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
81e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
849ae82921SPaul Mullowney 
857f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
88470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
89470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
907f756511SDominic Meiser 
9157181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
92a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9357181aedSStefano Zampini 
94c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
95e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
96219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
97c215019aSStefano Zampini 
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  /* the sequential matrix keeps a single format field, so both supported
     operations store into the same place (deliberate fall-through) */
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    spptr->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
1159ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
        `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if one is registered; a no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
141e057df02SPaul Mullowney 
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  /* record the preference on the GPU-specific part of the matrix; consulted
     when the factorization picks its solve routines */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
150365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation if one is registered; a no-op otherwise */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
176365b711fSMark Adams 
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    /* every other option is handled by the base sequential AIJ implementation */
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
192e6e9a74fSStefano Zampini 
193bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
194bddcd29dSMark Adams 
/* Numeric LU factorization for MATSEQAIJCUSPARSE.

   The factorization itself is performed by the CPU AIJ kernel (after pulling
   any GPU-resident values of A back to the host); this routine then selects
   the solve routines based on the row/column orderings and, unless the user
   requested CPU solves, stages the triangular factors on the GPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before factoring on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factors live on the host only */
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      /* natural ordering: the solves can skip the permutation step */
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* no GPU multi-RHS solves: fall back to the defaults */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
226bddcd29dSMark Adams 
/* Process -mat_cusparse_* runtime options: SpMV/TriSolve storage format,
   CPU-vs-GPU (I)LU solve, and (CUDA >= 11) cuSPARSE algorithm selections. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) { /* these options only apply to non-factored matrices */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
2629ae82921SPaul Mullowney 
/* Build (or refresh) the GPU copy of the unit-diagonal lower-triangular ILU
   factor L from the host AIJ factor stored in A.

   The host factor stores L's strictly-lower part row by row (a->i/a->j/a->a);
   this routine assembles a CSR matrix that additionally carries an explicit
   1.0 on each diagonal, uploads it into thrust device arrays, and runs the
   cuSPARSE triangular-solve analysis.  On the first call the full structure
   is created; on subsequent calls (loTriFactor already present) only the
   numerical values are repacked and re-uploaded.  Nothing is done when the
   host data is not newer than the device data (offloadmask check). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* NOTE(review): row 0 of L holds only the unit diagonal, hence ai[n]-ai[1] off-diagonal entries plus n diagonal ones */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers so the thrust assign()s below can copy to the device efficiently */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0; /* row 0 is just the unit diagonal */
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i ... */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* ... then append the explicit unit diagonal entry */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: copy the assembled CSR arrays into device (thrust) storage */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the scratch buffer required by the csrsv analysis/solve */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* AALo is kept (as AA_h) for later value-only updates; the index buffers are no longer needed */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; /* re-insert the unit diagonal */
          offset += 1;
          v += nz;
        }
        /* push the refreshed values to the existing device array (structure unchanged) */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3939ae82921SPaul Mullowney 
/*
  Build (first call) or refresh (subsequent calls) the upper triangular ILU factor of A on
  the GPU and run the cusparse triangular-solve analysis for it.

  The host-side factored Mat_SeqAIJ storage packs the U rows backwards (nzUpper =
  adiag[0] - adiag[n], and row i's strictly-upper entries start at aj/aa[adiag[i+1]+1]),
  so the fill loop walks rows from n-1 down to 0 while decrementing a running offset into
  the freshly built CSR arrays.

  No-op when the matrix has no rows or when the device copy is already current
  (offloadmask is neither PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) { /* first call: build row offsets, column indices AND values */
        PetscScalar *AAUp;

        /* pinned (page-locked) host staging buffer for the values */
        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          /* strictly-upper part of row i in the host factored storage */
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements: stored as the reciprocal of the host value */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          /* then copy the strictly-upper indices and values of row i */
          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device-side CSR copy of the factor */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an explicit work buffer: query the size, then allocate it on the device */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep AAUp alive (as AA_h) so value-only refreshes can reuse the pinned buffer;
           the structural arrays are no longer needed on the host */
        upTriFactor->AA_h                                          = AAUp;
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else { /* sparsity pattern unchanged: refresh only the numerical values */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        /* upload the refreshed values into the existing device CSR matrix */
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
5249ae82921SPaul Mullowney 
/*
  Push both ILU triangular factors of A to the GPU and, when the row/column orderings are
  not the identity, cache them as device-side index arrays for use by the solve kernels.
  Also (lazily) allocates the device work vector and records the factor nonzero count.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis = aij->row, colis = aij->icol;
  PetscBool                     rowIdentity, colIdentity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* build or refresh the device copies of L and U */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* device copy of the row permutation, built once and only for non-identity orderings */
  PetscCall(ISIdentity(rowis, &rowIdentity));
  if (!rowIdentity && !factors->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(rowis, &idx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(rowis, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* device copy of the column permutation, same lazy scheme */
  PetscCall(ISIdentity(colis, &colIdentity));
  if (!colIdentity && !factors->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(colis, &idx));
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(colis, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
5679ae82921SPaul Mullowney 
/*
  Build (first call) or refresh (subsequent calls) BOTH triangular factors for an ICC
  factorization of A on the GPU and run the cusparse solve analysis for each.

  The host factor stores only the upper triangle in CSR form (accessed via ai/aj/aa), with
  the row's diagonal value read from the last slot of the row (v[nz]).  A single CSR
  structure is built on the host and uploaded twice:
    - upTriFactor: unit-diagonal UPPER factor solved NON_TRANSPOSE; off-diagonal values are
      negated, the diagonal slot holds 1/d_i;
    - loTriFactor: the SAME sparsity pattern, values additionally divided by d_i, solved
      with CUSPARSE_OPERATION_TRANSPOSE so the stored upper triangle acts as the lower
      factor.

  NOTE(review): A->data is cast to both Mat_SeqAIJ and Mat_SeqSBAIJ below; this relies on
  the i/j/a members lining up between the two structs -- confirm before touching either
  declaration.

  No-op when the matrix has no rows or the device copy is already current.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays (the structure is shared) */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) { /* first call: build the structure and both factors */
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (both factors store 1/diag in the diagonal slot) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            /* negate the off-diagonals; the lower-factor copy is additionally scaled by 1/diag */
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* csrsv2 needs an explicit work buffer: query the size, then allocate it */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* fill mode stays UPPER: the stored upper triangle acts as the lower factor because
           the solve below uses the TRANSPOSE operation */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix: same sparsity pattern as the upper factor, different values */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else { /* sparsity pattern unchanged: recompute values only and upload them */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
764087f3262SPaul Mullowney 
/*
  Push the ICC triangular factors of A to the GPU; for a non-identity ordering also cache
  the permutation and its inverse as device-side index arrays.  Lazily allocates the device
  work vector and records the nonzero count of the (symmetric) factorization.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  IS                            perm    = aij->row;
  PetscBool                     isNatural;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(m);
  /* both triangles are represented, sharing the diagonal once */
  factors->nnz = (aij->nz - m) * 2 + m;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache row permutation (and its inverse for the columns) on the device */
  PetscCall(ISIdentity(perm, &isNatural));
  if (!isNatural) {
    IS              inv;
    const PetscInt *invIdx, *permIdx;

    PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &inv));
    PetscCall(ISGetIndices(inv, &invIdx));
    PetscCall(ISGetIndices(perm, &permIdx));
    factors->rpermIndices = new THRUSTINTARRAY(m);
    factors->rpermIndices->assign(permIdx, permIdx + m);
    factors->cpermIndices = new THRUSTINTARRAY(m);
    factors->cpermIndices->assign(invIdx, invIdx + m);
    PetscCall(ISRestoreIndices(inv, &invIdx));
    PetscCall(ISDestroy(&inv));
    PetscCall(ISRestoreIndices(perm, &permIdx));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
801087f3262SPaul Mullowney 
/*
  Numeric Cholesky for SeqAIJCUSPARSE: the factorization itself runs on the host (via the
  SeqAIJ kernel), after which the solve function pointers are selected and the triangular
  factors are pushed to the GPU.

  Parameters: B - the symbolic factor, filled in numerically here; A - the matrix being
  factored; info - factorization options passed through to the host kernel.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *fact = (Mat_SeqAIJ *)B->data;
  PetscBool   natural;

  PetscFunctionBegin;
  /* the numeric factorization happens on the host, so fetch any GPU-resident values first */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* pick the MatSolve variants: the natural-ordering kernels skip the permutation step */
  PetscCall(ISIdentity(fact->row, &natural));
  B->ops->solve             = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build/update the device-side triangular factors used by the solve kernels */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
8309ae82921SPaul Mullowney 
831d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
832d71ae5a4SJacob Faibussowitsch {
833bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
834aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
835aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
836da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
837da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
838aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
839aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
840aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
841aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
842b175d8bbSPaul Mullowney 
843bda325fcSPaul Mullowney   PetscFunctionBegin;
844aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
8459566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
846da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
847aa372e3fSPaul Mullowney 
848aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
849aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
850aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
8519371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
852aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
853aa372e3fSPaul Mullowney 
854aa372e3fSPaul Mullowney   /* Create the matrix description */
8559566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
8569566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
8579566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
8589566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
8599566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
860aa372e3fSPaul Mullowney 
861aa372e3fSPaul Mullowney   /* set the operation */
862aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
863aa372e3fSPaul Mullowney 
864aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
865aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
866afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
867afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
868aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
869afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
870afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
871afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
872aa372e3fSPaul Mullowney 
873aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
874afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8759371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
8769371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
8779371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
8789566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
879afb2bd1cSJunchao Zhang #endif
880afb2bd1cSJunchao Zhang 
8819566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
8829f7ba44dSJacob Faibussowitsch   {
8839f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
8849f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
8859371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
886afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8879f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
888afb2bd1cSJunchao Zhang #else
8899f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
890afb2bd1cSJunchao Zhang #endif
8919f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
8929f7ba44dSJacob Faibussowitsch   }
8939f7ba44dSJacob Faibussowitsch 
8949566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
8959566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
896aa372e3fSPaul Mullowney 
897afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
8989566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
899261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
9001b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9019371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
9029371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
9039566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
904afb2bd1cSJunchao Zhang #endif
905afb2bd1cSJunchao Zhang 
906afb2bd1cSJunchao Zhang   /* perform the solve analysis */
9079371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
9089f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
9099f7ba44dSJacob Faibussowitsch 
9109566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9119566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
912aa372e3fSPaul Mullowney 
913da79fbbcSStefano Zampini   /* assign the pointer */
914aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
915aa372e3fSPaul Mullowney 
916aa372e3fSPaul Mullowney   /*********************************************/
917aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
918aa372e3fSPaul Mullowney   /*********************************************/
919aa372e3fSPaul Mullowney 
920aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
9219566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
922da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
923aa372e3fSPaul Mullowney 
924aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
925aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
926aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
9279371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
928aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
929aa372e3fSPaul Mullowney 
930aa372e3fSPaul Mullowney   /* Create the matrix description */
9319566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
9329566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
9339566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
9349566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
9359566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
936aa372e3fSPaul Mullowney 
937aa372e3fSPaul Mullowney   /* set the operation */
938aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
939aa372e3fSPaul Mullowney 
940aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
941aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
942afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
943afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
944aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
945afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
946afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
947afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
948aa372e3fSPaul Mullowney 
949aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
950afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9519371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
9529371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
9539371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
9549566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
955afb2bd1cSJunchao Zhang #endif
956afb2bd1cSJunchao Zhang 
9579566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
9589f7ba44dSJacob Faibussowitsch   {
9599f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
9609f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
9619371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
962afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9639f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
964afb2bd1cSJunchao Zhang #else
9659f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
966afb2bd1cSJunchao Zhang #endif
9679f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
9689f7ba44dSJacob Faibussowitsch   }
969d49cd2b7SBarry Smith 
9709566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9719566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
972aa372e3fSPaul Mullowney 
973afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
9749566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
975261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
9761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9779371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9789371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
9799566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
980afb2bd1cSJunchao Zhang #endif
981afb2bd1cSJunchao Zhang 
982afb2bd1cSJunchao Zhang   /* perform the solve analysis */
9835f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
9849371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9859f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
986d49cd2b7SBarry Smith 
9879566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9889566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
989aa372e3fSPaul Mullowney 
990da79fbbcSStefano Zampini   /* assign the pointer */
991aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
992bda325fcSPaul Mullowney   PetscFunctionReturn(0);
993bda325fcSPaul Mullowney }
994bda325fcSPaul Mullowney 
9959371c9d4SSatish Balay struct PetscScalarToPetscInt {
9969371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
997a49f1ed0SStefano Zampini };
998a49f1ed0SStefano Zampini 
999d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1000d71ae5a4SJacob Faibussowitsch {
1001aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1002a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1003bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1004bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1005aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1006b175d8bbSPaul Mullowney 
1007bda325fcSPaul Mullowney   PetscFunctionBegin;
10089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1009a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
101028b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1011a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
101208401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
10131a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
10149566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
101648a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1017a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1018aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
10199566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1020aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
10219566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
10229566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1023aa372e3fSPaul Mullowney 
1024b06137fdSPaul Mullowney     /* set alpha and beta */
10259566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
10269566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
10279566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
10289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10309566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1031b06137fdSPaul Mullowney 
1032aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1033aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1034a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1035554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1036554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1037aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1038a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1039aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1040aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1041a3fdcf43SKarl Rupp 
1042ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
104381902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1044afb2bd1cSJunchao Zhang 
1045afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10463606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
10479371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
10489371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
10499371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
10503606e59fSJunchao Zhang   #else
10513606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
10523606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
10533606e59fSJunchao Zhang 
10543606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
10553606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
10563606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
10573606e59fSJunchao Zhang         */
10583606e59fSJunchao Zhang       if (matrixT->num_entries) {
10599371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
10609371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
10613606e59fSJunchao Zhang 
10623606e59fSJunchao Zhang       } else {
10633606e59fSJunchao Zhang         matstructT->matDescr = NULL;
10643606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
10653606e59fSJunchao Zhang       }
10663606e59fSJunchao Zhang   #endif
1067afb2bd1cSJunchao Zhang #endif
1068aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1069afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1070afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1071afb2bd1cSJunchao Zhang #else
1072aa372e3fSPaul Mullowney       CsrMatrix *temp = new CsrMatrix;
107351c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
107451c6d536SStefano Zampini       /* First convert HYB to CSR */
1075aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1076aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1077aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1078aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1079aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1080aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1081aa372e3fSPaul Mullowney 
10829371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
10839371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1084aa372e3fSPaul Mullowney 
1085aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1086aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1087aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1088aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1089aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1090aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1091aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1092aa372e3fSPaul Mullowney 
10939371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
10949371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
10959371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1096aa372e3fSPaul Mullowney 
1097aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1098aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
10999566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
11009371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
11019371c9d4SSatish Balay       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
11029371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1103aa372e3fSPaul Mullowney 
1104aa372e3fSPaul Mullowney       /* assign the pointer */
1105aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
11061a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1107aa372e3fSPaul Mullowney       /* delete temporaries */
1108aa372e3fSPaul Mullowney       if (tempT) {
1109aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1110aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1111aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1112aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1113087f3262SPaul Mullowney       }
1114aa372e3fSPaul Mullowney       if (temp) {
1115aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1116aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1117aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1118aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1119aa372e3fSPaul Mullowney       }
1120afb2bd1cSJunchao Zhang #endif
1121aa372e3fSPaul Mullowney     }
1122a49f1ed0SStefano Zampini   }
1123a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1124a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1125a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
112628b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
112728b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
112828b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
112928b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
113028b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
113128b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
113228b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
113328b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1134a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1135a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1136a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
11379566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1138a49f1ed0SStefano Zampini     }
1139a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1140a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1141792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1142a49f1ed0SStefano Zampini 
1143a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1144a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1145a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1146a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
11479371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
11489371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
11499371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
11509566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1151a49f1ed0SStefano Zampini #endif
1152a49f1ed0SStefano Zampini 
11531a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
11541a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
11551a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
11561a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
11571a2c6b5cSJunchao Zhang 
11581a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
11591a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
11601a2c6b5cSJunchao Zhang         */
11619371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1162a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11639371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
11649371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1165a49f1ed0SStefano Zampini #else
11669371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
11679371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1168a49f1ed0SStefano Zampini #endif
11691a2c6b5cSJunchao Zhang       } else {
11701a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
11711a2c6b5cSJunchao Zhang       }
11721a2c6b5cSJunchao Zhang 
1173a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1174792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1175a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11769566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1177a49f1ed0SStefano Zampini #endif
1178a49f1ed0SStefano Zampini     }
11799371c9d4SSatish Balay     PetscCallThrust(
11809371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1181a49f1ed0SStefano Zampini   }
11829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
11839566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1184213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1185213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1186aa372e3fSPaul Mullowney   /* assign the pointer */
1187aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
11881a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
1189bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1190bda325fcSPaul Mullowney }
1191bda325fcSPaul Mullowney 
1192a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
1193d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1194d71ae5a4SJacob Faibussowitsch {
1195c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1196465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1197465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1198465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1199465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1200bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1201aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1202aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1203aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1204bda325fcSPaul Mullowney 
1205bda325fcSPaul Mullowney   PetscFunctionBegin;
1206aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1207aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
12089566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1209aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1210aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1211bda325fcSPaul Mullowney   }
1212bda325fcSPaul Mullowney 
1213bda325fcSPaul Mullowney   /* Get the GPU pointers */
12149566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12159566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1216c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1217c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1218bda325fcSPaul Mullowney 
12199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1220aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
12219371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1222aa372e3fSPaul Mullowney 
1223aa372e3fSPaul Mullowney   /* First, solve U */
12249f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12259f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1226aa372e3fSPaul Mullowney 
1227aa372e3fSPaul Mullowney   /* Then, solve L */
12289f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
12299f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1230aa372e3fSPaul Mullowney 
1231aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
12329371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1233aa372e3fSPaul Mullowney 
1234aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1235a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1236bda325fcSPaul Mullowney 
1237bda325fcSPaul Mullowney   /* restore */
12389566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
12399566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
12409566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1242bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1243bda325fcSPaul Mullowney }
1244bda325fcSPaul Mullowney 
1245d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1246d71ae5a4SJacob Faibussowitsch {
1247465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1248465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1249bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1250aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1251aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1252aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1253bda325fcSPaul Mullowney 
1254bda325fcSPaul Mullowney   PetscFunctionBegin;
1255aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1256aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
12579566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1258aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1259aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1260bda325fcSPaul Mullowney   }
1261bda325fcSPaul Mullowney 
1262bda325fcSPaul Mullowney   /* Get the GPU pointers */
12639566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12649566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1265bda325fcSPaul Mullowney 
12669566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1267aa372e3fSPaul Mullowney   /* First, solve U */
12689f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
12699f7ba44dSJacob Faibussowitsch                                          upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1270aa372e3fSPaul Mullowney 
1271aa372e3fSPaul Mullowney   /* Then, solve L */
12729f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
12739f7ba44dSJacob Faibussowitsch                                          loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1274bda325fcSPaul Mullowney 
1275bda325fcSPaul Mullowney   /* restore */
12769566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
12779566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
12789566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1280bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1281bda325fcSPaul Mullowney }
1282bda325fcSPaul Mullowney 
1283d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
1284d71ae5a4SJacob Faibussowitsch {
1285465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1286465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1287465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1288465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
12899ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1290aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1291aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1292aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
12939ae82921SPaul Mullowney 
12949ae82921SPaul Mullowney   PetscFunctionBegin;
1295e057df02SPaul Mullowney   /* Get the GPU pointers */
12969566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12979566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1298c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1299c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
13009ae82921SPaul Mullowney 
13019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1302aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
13039371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1304aa372e3fSPaul Mullowney 
1305aa372e3fSPaul Mullowney   /* Next, solve L */
13069f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
13079f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1308aa372e3fSPaul Mullowney 
1309aa372e3fSPaul Mullowney   /* Then, solve U */
13109f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
13119f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));
1312d49cd2b7SBarry Smith 
13134e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
13149371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
13159ae82921SPaul Mullowney 
13169566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13179566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13189566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
13209ae82921SPaul Mullowney   PetscFunctionReturn(0);
13219ae82921SPaul Mullowney }
13229ae82921SPaul Mullowney 
1323d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
1324d71ae5a4SJacob Faibussowitsch {
1325465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1326465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
13279ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1328aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1329aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1330aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
13319ae82921SPaul Mullowney 
13329ae82921SPaul Mullowney   PetscFunctionBegin;
1333e057df02SPaul Mullowney   /* Get the GPU pointers */
13349566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
13359566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
13369ae82921SPaul Mullowney 
13379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1338aa372e3fSPaul Mullowney   /* First, solve L */
13399f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
13409f7ba44dSJacob Faibussowitsch                                          loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));
1341d49cd2b7SBarry Smith 
1342aa372e3fSPaul Mullowney   /* Next, solve U */
13439f7ba44dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
13449f7ba44dSJacob Faibussowitsch                                          upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
13459ae82921SPaul Mullowney 
13469566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13479566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13489566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13499566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
13509ae82921SPaul Mullowney   PetscFunctionReturn(0);
13519ae82921SPaul Mullowney }
13529ae82921SPaul Mullowney 
1353da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1354da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
1355d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1356d71ae5a4SJacob Faibussowitsch {
1357da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1358da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1359da112707SJunchao Zhang   const PetscScalar            *barray;
1360da112707SJunchao Zhang   PetscScalar                  *xarray;
1361da112707SJunchao Zhang 
1362da112707SJunchao Zhang   PetscFunctionBegin;
1363da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1364da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1365da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1366da112707SJunchao Zhang 
1367da112707SJunchao Zhang   /* Solve L*y = b */
1368da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1369da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
13709371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
13719371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
137212ba2bc6SJunchao Zhang                                        fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()!
1373da112707SJunchao Zhang 
1374da112707SJunchao Zhang   /* Solve U*x = y */
1375da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
13769371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
13779371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1378da112707SJunchao Zhang 
1379da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1380da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1381da112707SJunchao Zhang 
1382da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1383da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1384da112707SJunchao Zhang   PetscFunctionReturn(0);
1385da112707SJunchao Zhang }
1386da112707SJunchao Zhang 
1387d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
1388d71ae5a4SJacob Faibussowitsch {
1389da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1390da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1391da112707SJunchao Zhang   const PetscScalar            *barray;
1392da112707SJunchao Zhang   PetscScalar                  *xarray;
1393da112707SJunchao Zhang 
1394da112707SJunchao Zhang   PetscFunctionBegin;
139512ba2bc6SJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1396da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
139735cb6cd3SPierre Jolivet     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
13989371c9d4SSatish Balay                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1399da112707SJunchao Zhang 
1400da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
14019371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1402da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
140312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
140412ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
140512ba2bc6SJunchao Zhang   }
1406da112707SJunchao Zhang 
140712ba2bc6SJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
14089371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1409da112707SJunchao Zhang 
14109371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
141112ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1412da112707SJunchao Zhang   }
1413da112707SJunchao Zhang 
1414da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1415da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1416da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1417da112707SJunchao Zhang 
1418da112707SJunchao Zhang   /* Solve Ut*y = b */
1419da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1420da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
14219371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
14229371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1423da112707SJunchao Zhang 
1424da112707SJunchao Zhang   /* Solve Lt*x = y */
1425da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
14269371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
14279371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1428da112707SJunchao Zhang 
1429da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1430da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1431da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1432da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1433da112707SJunchao Zhang   PetscFunctionReturn(0);
1434da112707SJunchao Zhang }
1435da112707SJunchao Zhang 
1436*8eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
1437d71ae5a4SJacob Faibussowitsch {
1438da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1439da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1440da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1441da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1442da112707SJunchao Zhang   PetscInt                      m, nz;
1443da112707SJunchao Zhang   PetscBool                     flg;
1444da112707SJunchao Zhang 
1445da112707SJunchao Zhang   PetscFunctionBegin;
1446da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1447da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1448da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1449da112707SJunchao Zhang   }
1450da112707SJunchao Zhang 
1451da112707SJunchao Zhang   /* Copy A's value to fact */
1452da112707SJunchao Zhang   m  = fact->rmap->n;
1453da112707SJunchao Zhang   nz = aij->nz;
1454da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1455da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1456da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1457da112707SJunchao Zhang 
1458da112707SJunchao Zhang   /* Factorize fact inplace */
14599371c9d4SSatish Balay   if (m)
14609371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
14619371c9d4SSatish Balay                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1462da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1463da112707SJunchao Zhang     int              numerical_zero;
1464da112707SJunchao Zhang     cusparseStatus_t status;
1465da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1466da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1467da112707SJunchao Zhang   }
1468da112707SJunchao Zhang 
146912ba2bc6SJunchao Zhang   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
147012ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
147112ba2bc6SJunchao Zhang   */
14729371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1473da112707SJunchao Zhang 
14749371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1475da112707SJunchao Zhang 
147612ba2bc6SJunchao Zhang   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
147712ba2bc6SJunchao Zhang   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
147812ba2bc6SJunchao Zhang 
1479da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1480da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1481da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1482da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1483da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1484da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1485da112707SJunchao Zhang   PetscFunctionReturn(0);
1486da112707SJunchao Zhang }
1487da112707SJunchao Zhang 
1488*8eb1d50fSPierre Jolivet static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
1489d71ae5a4SJacob Faibussowitsch {
1490da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1491da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1492da112707SJunchao Zhang   PetscInt                      m, nz;
1493da112707SJunchao Zhang 
1494da112707SJunchao Zhang   PetscFunctionBegin;
1495da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1496da112707SJunchao Zhang     PetscInt  i;
1497da112707SJunchao Zhang     PetscBool flg, missing;
1498da112707SJunchao Zhang 
1499da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1500da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1501da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1502da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1503da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1504da112707SJunchao Zhang   }
1505da112707SJunchao Zhang 
1506da112707SJunchao Zhang   /* Free the old stale stuff */
1507da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1508da112707SJunchao Zhang 
1509da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1510da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1511da112707SJunchao Zhang    */
1512da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1513da112707SJunchao Zhang 
1514da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1515da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1516da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1517da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1518da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1519da112707SJunchao Zhang 
1520da112707SJunchao Zhang   aij->row = NULL;
1521da112707SJunchao Zhang   aij->col = NULL;
1522da112707SJunchao Zhang 
1523da112707SJunchao Zhang   /* ====================================================================== */
1524da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1525da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1526da112707SJunchao Zhang   /* ====================================================================== */
1527da112707SJunchao Zhang   const int *Ai, *Aj;
1528da112707SJunchao Zhang 
1529da112707SJunchao Zhang   m  = fact->rmap->n;
1530da112707SJunchao Zhang   nz = aij->nz;
1531da112707SJunchao Zhang 
1532da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1533da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1534da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1535da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1536da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1537da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1538da112707SJunchao Zhang 
1539da112707SJunchao Zhang   /* ====================================================================== */
1540da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1541da112707SJunchao Zhang   /* ====================================================================== */
1542da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1543da112707SJunchao Zhang   cusparseDiagType_t diagType;
1544da112707SJunchao Zhang 
1545da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1546da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1547da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1548da112707SJunchao Zhang 
1549da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1550da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1551da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1552da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1553da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1554da112707SJunchao Zhang   */
1555da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1556da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
15579371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
15589371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
15599371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1560da112707SJunchao Zhang 
1561da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1562da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
15639371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
15649371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
15659371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1566da112707SJunchao Zhang 
1567da112707SJunchao Zhang   /* ========================================================================= */
1568da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1569da112707SJunchao Zhang   /* ========================================================================= */
1570da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
15719371c9d4SSatish Balay   if (m)
15729371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
15739371c9d4SSatish Balay                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1574da112707SJunchao Zhang 
1575da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1576da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1577da112707SJunchao Zhang 
1578da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1579da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1580da112707SJunchao Zhang 
1581da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
15829371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1583da112707SJunchao Zhang 
1584da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
15859371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1586da112707SJunchao Zhang 
1587da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
158812ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
158912ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
159012ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1591da112707SJunchao Zhang    */
159212ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
159312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
159412ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1595da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
159612ba2bc6SJunchao Zhang   } else {
159712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
159812ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1599da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
160012ba2bc6SJunchao Zhang   }
1601da112707SJunchao Zhang 
1602da112707SJunchao Zhang   /* ========================================================================== */
1603da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1604da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1605da112707SJunchao Zhang   /* ========================================================================== */
1606da112707SJunchao Zhang   int              structural_zero;
1607da112707SJunchao Zhang   cusparseStatus_t status;
1608da112707SJunchao Zhang 
1609da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
16109371c9d4SSatish Balay   if (m)
16119371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
16129371c9d4SSatish Balay                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1613da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1614da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1615da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1616da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1617da112707SJunchao Zhang   }
1618da112707SJunchao Zhang 
1619da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
16200dd8c0acSJunchao Zhang   {
1621da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
16220dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1623da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1624da112707SJunchao Zhang 
1625da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1626da112707SJunchao Zhang     Ai    = Aseq->i;
1627da112707SJunchao Zhang     Adiag = Aseq->diag;
1628da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1629da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1630da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1631da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1632da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1633da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1634da112707SJunchao Zhang         */
1635da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1636da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1637da112707SJunchao Zhang       }
1638da112707SJunchao Zhang     }
1639da112707SJunchao Zhang     fs->numericFactFlops = flops;
16400dd8c0acSJunchao Zhang   }
1641da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1642da112707SJunchao Zhang   PetscFunctionReturn(0);
1643da112707SJunchao Zhang }
1644da112707SJunchao Zhang 
1645d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1646d71ae5a4SJacob Faibussowitsch {
1647da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1648da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1649da112707SJunchao Zhang   const PetscScalar            *barray;
1650da112707SJunchao Zhang   PetscScalar                  *xarray;
1651da112707SJunchao Zhang 
1652da112707SJunchao Zhang   PetscFunctionBegin;
1653da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1654da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1655da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1656da112707SJunchao Zhang 
1657da112707SJunchao Zhang   /* Solve L*y = b */
1658da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1659da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
16609371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
16619371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1662da112707SJunchao Zhang 
1663da112707SJunchao Zhang   /* Solve Lt*x = y */
1664da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
16659371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
16669371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1667da112707SJunchao Zhang 
1668da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1669da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1670da112707SJunchao Zhang 
1671da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1672da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1673da112707SJunchao Zhang   PetscFunctionReturn(0);
1674da112707SJunchao Zhang }
1675da112707SJunchao Zhang 
1676*8eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
1677d71ae5a4SJacob Faibussowitsch {
1678da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1679da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1680da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1681da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1682da112707SJunchao Zhang   PetscInt                      m, nz;
1683da112707SJunchao Zhang   PetscBool                     flg;
1684da112707SJunchao Zhang 
1685da112707SJunchao Zhang   PetscFunctionBegin;
1686da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1687da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1688da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1689da112707SJunchao Zhang   }
1690da112707SJunchao Zhang 
1691da112707SJunchao Zhang   /* Copy A's value to fact */
1692da112707SJunchao Zhang   m  = fact->rmap->n;
1693da112707SJunchao Zhang   nz = aij->nz;
1694da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1695da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1696da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1697da112707SJunchao Zhang 
1698da112707SJunchao Zhang   /* Factorize fact inplace */
1699da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1700da112707SJunchao Zhang      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1701da112707SJunchao Zhang      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1702da112707SJunchao Zhang      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1703da112707SJunchao Zhang      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1704da112707SJunchao Zhang    */
17059371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1706da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1707da112707SJunchao Zhang     int              numerical_zero;
1708da112707SJunchao Zhang     cusparseStatus_t status;
1709da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1710da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1711da112707SJunchao Zhang   }
1712da112707SJunchao Zhang 
17139371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1714da112707SJunchao Zhang 
1715da112707SJunchao Zhang   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1716da112707SJunchao Zhang     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1717da112707SJunchao Zhang   */
17189371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1719da112707SJunchao Zhang 
1720da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1721da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1722da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1723da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1724da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1725da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1726da112707SJunchao Zhang   PetscFunctionReturn(0);
1727da112707SJunchao Zhang }
1728da112707SJunchao Zhang 
1729*8eb1d50fSPierre Jolivet static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
1730d71ae5a4SJacob Faibussowitsch {
1731da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1732da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1733da112707SJunchao Zhang   PetscInt                      m, nz;
1734da112707SJunchao Zhang 
1735da112707SJunchao Zhang   PetscFunctionBegin;
1736da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1737da112707SJunchao Zhang     PetscInt  i;
1738da112707SJunchao Zhang     PetscBool flg, missing;
1739da112707SJunchao Zhang 
1740da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1741da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1742da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1743da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1744da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1745da112707SJunchao Zhang   }
1746da112707SJunchao Zhang 
1747da112707SJunchao Zhang   /* Free the old stale stuff */
1748da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1749da112707SJunchao Zhang 
1750da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1751da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1752da112707SJunchao Zhang    */
1753da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1754da112707SJunchao Zhang 
1755da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1756da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
1757da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1758da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1759da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1760da112707SJunchao Zhang 
1761da112707SJunchao Zhang   aij->row = NULL;
1762da112707SJunchao Zhang   aij->col = NULL;
1763da112707SJunchao Zhang 
1764da112707SJunchao Zhang   /* ====================================================================== */
1765da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1766da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1767da112707SJunchao Zhang   /* ====================================================================== */
1768da112707SJunchao Zhang   const int *Ai, *Aj;
1769da112707SJunchao Zhang 
1770da112707SJunchao Zhang   m  = fact->rmap->n;
1771da112707SJunchao Zhang   nz = aij->nz;
1772da112707SJunchao Zhang 
1773da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1774da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1775da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1776da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1777da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1778da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1779da112707SJunchao Zhang 
1780da112707SJunchao Zhang   /* ====================================================================== */
1781da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
1782da112707SJunchao Zhang   /* ====================================================================== */
1783da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1784da112707SJunchao Zhang   cusparseDiagType_t diagType;
1785da112707SJunchao Zhang 
1786da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1787da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1788da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1789da112707SJunchao Zhang 
1790da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1791da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1792da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1793da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1794da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1795da112707SJunchao Zhang   */
1796da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1797da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
17989371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
17999371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18009371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1801da112707SJunchao Zhang 
1802da112707SJunchao Zhang   /* ========================================================================= */
1803da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1804da112707SJunchao Zhang   /* ========================================================================= */
1805da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
18069371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1807da112707SJunchao Zhang 
1808da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1809da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1810da112707SJunchao Zhang 
1811da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1812da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1813da112707SJunchao Zhang 
1814da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18159371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1816da112707SJunchao Zhang 
1817da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
18189371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1819da112707SJunchao Zhang 
182012ba2bc6SJunchao Zhang   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
182112ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
182212ba2bc6SJunchao Zhang    */
182312ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
182412ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
182512ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1826da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
182712ba2bc6SJunchao Zhang   } else {
182812ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
182912ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
183012ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
183112ba2bc6SJunchao Zhang   }
1832da112707SJunchao Zhang 
1833da112707SJunchao Zhang   /* ========================================================================== */
1834da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
1835da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
1836da112707SJunchao Zhang   /* ========================================================================== */
1837da112707SJunchao Zhang   int              structural_zero;
1838da112707SJunchao Zhang   cusparseStatus_t status;
1839da112707SJunchao Zhang 
1840da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18419371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1842da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1843da112707SJunchao Zhang     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1844da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1845da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1846da112707SJunchao Zhang   }
1847da112707SJunchao Zhang 
1848da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18490dd8c0acSJunchao Zhang   {
1850da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
18510dd8c0acSJunchao Zhang     PetscInt      *Ai, nzRow, nzLeft;
1852da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1853da112707SJunchao Zhang 
1854da112707SJunchao Zhang     Ai = Aseq->i;
1855da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1856da112707SJunchao Zhang       nzRow = Ai[i + 1] - Ai[i];
1857da112707SJunchao Zhang       if (nzRow > 1) {
1858da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1859da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1860da112707SJunchao Zhang         */
1861da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1862da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1863da112707SJunchao Zhang       }
1864da112707SJunchao Zhang     }
1865da112707SJunchao Zhang     fs->numericFactFlops = flops;
18660dd8c0acSJunchao Zhang   }
1867da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1868da112707SJunchao Zhang   PetscFunctionReturn(0);
1869da112707SJunchao Zhang }
1870da112707SJunchao Zhang #endif
1871da112707SJunchao Zhang 
1872d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1873d71ae5a4SJacob Faibussowitsch {
1874da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1875da112707SJunchao Zhang 
1876da112707SJunchao Zhang   PetscFunctionBegin;
1877da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1878bc996fdcSJunchao Zhang   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1879bc996fdcSJunchao Zhang   if (cusparseTriFactors->factorizeOnDevice) {
1880da112707SJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
1881da112707SJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
1882bc996fdcSJunchao Zhang   }
1883da112707SJunchao Zhang   if (!info->levels && row_identity && col_identity) {
1884da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1885da112707SJunchao Zhang   } else
1886da112707SJunchao Zhang #endif
1887da112707SJunchao Zhang   {
1888da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1889da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1890da112707SJunchao Zhang     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1891da112707SJunchao Zhang   }
1892da112707SJunchao Zhang   PetscFunctionReturn(0);
1893da112707SJunchao Zhang }
1894da112707SJunchao Zhang 
1895d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1896d71ae5a4SJacob Faibussowitsch {
1897da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1898da112707SJunchao Zhang 
1899da112707SJunchao Zhang   PetscFunctionBegin;
1900da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1901da112707SJunchao Zhang   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1902da112707SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1903da112707SJunchao Zhang   PetscFunctionReturn(0);
1904da112707SJunchao Zhang }
1905da112707SJunchao Zhang 
1906d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1907d71ae5a4SJacob Faibussowitsch {
1908da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1909da112707SJunchao Zhang 
1910da112707SJunchao Zhang   PetscFunctionBegin;
1911da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1912bc996fdcSJunchao Zhang   PetscBool perm_identity = PETSC_FALSE;
1913bc996fdcSJunchao Zhang   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1914da112707SJunchao Zhang   if (!info->levels && perm_identity) {
1915da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1916da112707SJunchao Zhang   } else
1917da112707SJunchao Zhang #endif
1918da112707SJunchao Zhang   {
1919da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1920da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1921da112707SJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1922da112707SJunchao Zhang   }
1923da112707SJunchao Zhang   PetscFunctionReturn(0);
1924da112707SJunchao Zhang }
1925da112707SJunchao Zhang 
1926d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1927d71ae5a4SJacob Faibussowitsch {
1928da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1929da112707SJunchao Zhang 
1930da112707SJunchao Zhang   PetscFunctionBegin;
1931da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1932da112707SJunchao Zhang   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
1933da112707SJunchao Zhang   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1934da112707SJunchao Zhang   PetscFunctionReturn(0);
1935da112707SJunchao Zhang }
1936da112707SJunchao Zhang 
1937*8eb1d50fSPierre Jolivet PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
1938d71ae5a4SJacob Faibussowitsch {
1939841d4cb1SJunchao Zhang   PetscFunctionBegin;
1940841d4cb1SJunchao Zhang   *type = MATSOLVERCUSPARSE;
1941841d4cb1SJunchao Zhang   PetscFunctionReturn(0);
1942841d4cb1SJunchao Zhang }
1943841d4cb1SJunchao Zhang 
1944841d4cb1SJunchao Zhang /*MC
1945841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
194611a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
1947841d4cb1SJunchao Zhang   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
1948841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
194911a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1950841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
1951841d4cb1SJunchao Zhang 
1952841d4cb1SJunchao Zhang   Level: beginner
1953841d4cb1SJunchao Zhang 
195411a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1955841d4cb1SJunchao Zhang M*/
1956841d4cb1SJunchao Zhang 
1957d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
1958d71ae5a4SJacob Faibussowitsch {
1959841d4cb1SJunchao Zhang   PetscInt  n = A->rmap->n;
1960bc996fdcSJunchao Zhang   PetscBool factOnDevice, factOnHost;
1961bc996fdcSJunchao Zhang   char     *prefix;
1962bc996fdcSJunchao Zhang   char      factPlace[32] = "device"; /* the default */
1963841d4cb1SJunchao Zhang 
1964841d4cb1SJunchao Zhang   PetscFunctionBegin;
1965841d4cb1SJunchao Zhang   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
1966841d4cb1SJunchao Zhang   PetscCall(MatSetSizes(*B, n, n, n, n));
1967841d4cb1SJunchao Zhang   (*B)->factortype = ftype;
1968841d4cb1SJunchao Zhang   PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));
1969841d4cb1SJunchao Zhang 
1970bc996fdcSJunchao Zhang   prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
1971bc996fdcSJunchao Zhang   PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
1972bc996fdcSJunchao Zhang   PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
1973bc996fdcSJunchao Zhang   PetscOptionsEnd();
1974bc996fdcSJunchao Zhang   PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
1975bc996fdcSJunchao Zhang   PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
1976bc996fdcSJunchao Zhang   PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
1977bc996fdcSJunchao Zhang   ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;
1978bc996fdcSJunchao Zhang 
1979841d4cb1SJunchao Zhang   if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
1980841d4cb1SJunchao Zhang   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1981841d4cb1SJunchao Zhang     PetscCall(MatSetBlockSizesFromMats(*B, A, A));
1982841d4cb1SJunchao Zhang     if (!A->boundtocpu) {
1983841d4cb1SJunchao Zhang       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1984841d4cb1SJunchao Zhang       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1985841d4cb1SJunchao Zhang     } else {
1986841d4cb1SJunchao Zhang       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1987841d4cb1SJunchao Zhang       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1988841d4cb1SJunchao Zhang     }
1989841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
1990841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1991841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
1992841d4cb1SJunchao Zhang   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1993841d4cb1SJunchao Zhang     if (!A->boundtocpu) {
1994841d4cb1SJunchao Zhang       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
1995841d4cb1SJunchao Zhang       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1996841d4cb1SJunchao Zhang     } else {
1997841d4cb1SJunchao Zhang       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1998841d4cb1SJunchao Zhang       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1999841d4cb1SJunchao Zhang     }
2000841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2001841d4cb1SJunchao Zhang     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2002841d4cb1SJunchao Zhang   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
2003841d4cb1SJunchao Zhang 
2004841d4cb1SJunchao Zhang   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2005841d4cb1SJunchao Zhang   (*B)->canuseordering = PETSC_TRUE;
2006841d4cb1SJunchao Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
2007841d4cb1SJunchao Zhang   PetscFunctionReturn(0);
2008841d4cb1SJunchao Zhang }
2009841d4cb1SJunchao Zhang 
2010d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
2011d71ae5a4SJacob Faibussowitsch {
20127e8381f9SStefano Zampini   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
20137e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
20140dd8c0acSJunchao Zhang #if CUSPARSE_VERSION >= 13500
2015da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
20160dd8c0acSJunchao Zhang #endif
20177e8381f9SStefano Zampini 
20187e8381f9SStefano Zampini   PetscFunctionBegin;
20197e8381f9SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
20209566063dSJacob Faibussowitsch     PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
2021da112707SJunchao Zhang     if (A->factortype == MAT_FACTOR_NONE) {
2022da112707SJunchao Zhang       CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
20239566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2024da112707SJunchao Zhang     }
2025da112707SJunchao Zhang #if CUSPARSE_VERSION >= 13500
2026da112707SJunchao Zhang     else if (fs->csrVal) {
2027da112707SJunchao Zhang       /* We have a factorized matrix on device and are able to copy it to host */
2028da112707SJunchao Zhang       PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
2029da112707SJunchao Zhang     }
2030da112707SJunchao Zhang #endif
20319371c9d4SSatish Balay     else
20329371c9d4SSatish Balay       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
20339566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
20349566063dSJacob Faibussowitsch     PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
20357e8381f9SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
20367e8381f9SStefano Zampini   }
20377e8381f9SStefano Zampini   PetscFunctionReturn(0);
20387e8381f9SStefano Zampini }
20397e8381f9SStefano Zampini 
2040d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2041d71ae5a4SJacob Faibussowitsch {
20427e8381f9SStefano Zampini   PetscFunctionBegin;
20439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
204467a45760SJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
204567a45760SJunchao Zhang   PetscFunctionReturn(0);
204667a45760SJunchao Zhang }
204767a45760SJunchao Zhang 
2048d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2049d71ae5a4SJacob Faibussowitsch {
205067a45760SJunchao Zhang   PetscFunctionBegin;
20517e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
205267a45760SJunchao Zhang   *array         = NULL;
205367a45760SJunchao Zhang   PetscFunctionReturn(0);
205467a45760SJunchao Zhang }
205567a45760SJunchao Zhang 
2056d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
2057d71ae5a4SJacob Faibussowitsch {
205867a45760SJunchao Zhang   PetscFunctionBegin;
20599566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
206067a45760SJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
206167a45760SJunchao Zhang   PetscFunctionReturn(0);
206267a45760SJunchao Zhang }
206367a45760SJunchao Zhang 
2064*8eb1d50fSPierre Jolivet static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
2065d71ae5a4SJacob Faibussowitsch {
206667a45760SJunchao Zhang   PetscFunctionBegin;
206767a45760SJunchao Zhang   *array = NULL;
206867a45760SJunchao Zhang   PetscFunctionReturn(0);
206967a45760SJunchao Zhang }
207067a45760SJunchao Zhang 
2071d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2072d71ae5a4SJacob Faibussowitsch {
207367a45760SJunchao Zhang   PetscFunctionBegin;
207467a45760SJunchao Zhang   *array = ((Mat_SeqAIJ *)A->data)->a;
207567a45760SJunchao Zhang   PetscFunctionReturn(0);
207667a45760SJunchao Zhang }
207767a45760SJunchao Zhang 
2078d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
2079d71ae5a4SJacob Faibussowitsch {
208067a45760SJunchao Zhang   PetscFunctionBegin;
208167a45760SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
208267a45760SJunchao Zhang   *array         = NULL;
20837e8381f9SStefano Zampini   PetscFunctionReturn(0);
20847e8381f9SStefano Zampini }
20857e8381f9SStefano Zampini 
2086d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
2087d71ae5a4SJacob Faibussowitsch {
20887ee59b9bSJunchao Zhang   Mat_SeqAIJCUSPARSE *cusp;
20897ee59b9bSJunchao Zhang   CsrMatrix          *matrix;
20907ee59b9bSJunchao Zhang 
20917ee59b9bSJunchao Zhang   PetscFunctionBegin;
20927ee59b9bSJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
20937ee59b9bSJunchao Zhang   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
20947ee59b9bSJunchao Zhang   cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
20957ee59b9bSJunchao Zhang   PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
20967ee59b9bSJunchao Zhang   matrix = (CsrMatrix *)cusp->mat->mat;
20977ee59b9bSJunchao Zhang 
20987ee59b9bSJunchao Zhang   if (i) {
20997ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES)
21007ee59b9bSJunchao Zhang     *i = matrix->row_offsets->data().get();
21017ee59b9bSJunchao Zhang #else
21027ee59b9bSJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
21037ee59b9bSJunchao Zhang #endif
21047ee59b9bSJunchao Zhang   }
21057ee59b9bSJunchao Zhang   if (j) {
21067ee59b9bSJunchao Zhang #if !defined(PETSC_USE_64BIT_INDICES)
21077ee59b9bSJunchao Zhang     *j = matrix->column_indices->data().get();
21087ee59b9bSJunchao Zhang #else
21097ee59b9bSJunchao Zhang     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
21107ee59b9bSJunchao Zhang #endif
21117ee59b9bSJunchao Zhang   }
21127ee59b9bSJunchao Zhang   if (a) *a = matrix->values->data().get();
21137ee59b9bSJunchao Zhang   if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
21147ee59b9bSJunchao Zhang   PetscFunctionReturn(0);
21157ee59b9bSJunchao Zhang }
21167ee59b9bSJunchao Zhang 
2117d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
2118d71ae5a4SJacob Faibussowitsch {
2119aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
21207c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
21219ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
2122213423ffSJunchao Zhang   PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
2123aa372e3fSPaul Mullowney   cusparseStatus_t              stat;
2124abb89eb1SStefano Zampini   PetscBool                     both = PETSC_TRUE;
21259ae82921SPaul Mullowney 
21269ae82921SPaul Mullowney   PetscFunctionBegin;
212728b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
2128c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
2129a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
2130a49f1ed0SStefano Zampini       CsrMatrix *matrix;
2131afb2bd1cSJunchao Zhang       matrix = (CsrMatrix *)cusparsestruct->mat->mat;
213285ba7357SStefano Zampini 
213308401ef6SPierre Jolivet       PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
21349566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
2135afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a + a->nz);
21369566063dSJacob Faibussowitsch       PetscCallCUDA(WaitForCUDA());
21379566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
21389566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
21399566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
214034d6c7a5SJose E. Roman     } else {
2141abb89eb1SStefano Zampini       PetscInt nnz;
21429566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
21439566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
21449566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
21457c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
214681902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
2147a49f1ed0SStefano Zampini       cusparsestruct->workVector     = NULL;
2148a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
21499ae82921SPaul Mullowney       try {
21509ae82921SPaul Mullowney         if (a->compressedrow.use) {
21519ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
21529ae82921SPaul Mullowney           ii   = a->compressedrow.i;
21539ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
21549ae82921SPaul Mullowney         } else {
2155213423ffSJunchao Zhang           m    = A->rmap->n;
2156213423ffSJunchao Zhang           ii   = a->i;
2157e6e9a74fSStefano Zampini           ridx = NULL;
21589ae82921SPaul Mullowney         }
215908401ef6SPierre Jolivet         PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
21609371c9d4SSatish Balay         if (!a->a) {
21619371c9d4SSatish Balay           nnz  = ii[m];
21629371c9d4SSatish Balay           both = PETSC_FALSE;
21639371c9d4SSatish Balay         } else nnz = a->nz;
216408401ef6SPierre Jolivet         PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");
21659ae82921SPaul Mullowney 
216685ba7357SStefano Zampini         /* create cusparse matrix */
2167abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
2168aa372e3fSPaul Mullowney         matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
21699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
21709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
21719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
21729ae82921SPaul Mullowney 
21739566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
21749566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
21759566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
21769566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
21779566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
21789566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
21799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
2180b06137fdSPaul Mullowney 
2181aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
2182aa372e3fSPaul Mullowney         if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2183aa372e3fSPaul Mullowney           /* set the matrix */
2184afb2bd1cSJunchao Zhang           CsrMatrix *mat   = new CsrMatrix;
2185afb2bd1cSJunchao Zhang           mat->num_rows    = m;
2186afb2bd1cSJunchao Zhang           mat->num_cols    = A->cmap->n;
2187abb89eb1SStefano Zampini           mat->num_entries = nnz;
2188afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2189afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m + 1);
21909ae82921SPaul Mullowney 
2191abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
2192abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j + nnz);
2193aa372e3fSPaul Mullowney 
2194abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
2195abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a + nnz);
2196aa372e3fSPaul Mullowney 
2197aa372e3fSPaul Mullowney           /* assign the pointer */
2198afb2bd1cSJunchao Zhang           matstruct->mat = mat;
2199afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2200afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
22019371c9d4SSatish Balay             stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
22029371c9d4SSatish Balay                                      CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
22039371c9d4SSatish Balay             PetscCallCUSPARSE(stat);
2204afb2bd1cSJunchao Zhang           }
2205afb2bd1cSJunchao Zhang #endif
2206aa372e3fSPaul Mullowney         } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
2207afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2208afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2209afb2bd1cSJunchao Zhang #else
2210afb2bd1cSJunchao Zhang           CsrMatrix *mat = new CsrMatrix;
2211afb2bd1cSJunchao Zhang           mat->num_rows = m;
2212afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
2213abb89eb1SStefano Zampini           mat->num_entries = nnz;
2214afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m + 1);
2215afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m + 1);
2216aa372e3fSPaul Mullowney 
2217abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
2218abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j + nnz);
2219aa372e3fSPaul Mullowney 
2220abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
2221abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a + nnz);
2222aa372e3fSPaul Mullowney 
2223aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
22249566063dSJacob Faibussowitsch           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
22259371c9d4SSatish Balay           cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
22269371c9d4SSatish Balay           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
22279371c9d4SSatish Balay           PetscCallCUSPARSE(stat);
2228aa372e3fSPaul Mullowney           /* assign the pointer */
2229aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
2230aa372e3fSPaul Mullowney 
2231afb2bd1cSJunchao Zhang           if (mat) {
2232afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY *)mat->values;
2233afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
2234afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
2235afb2bd1cSJunchao Zhang             delete (CsrMatrix *)mat;
2236087f3262SPaul Mullowney           }
2237afb2bd1cSJunchao Zhang #endif
2238087f3262SPaul Mullowney         }
2239ca45077fSPaul Mullowney 
2240aa372e3fSPaul Mullowney         /* assign the compressed row indices */
2241213423ffSJunchao Zhang         if (a->compressedrow.use) {
2242213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
2243aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
2244aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx, ridx + m);
2245213423ffSJunchao Zhang           tmp = m;
2246213423ffSJunchao Zhang         } else {
2247213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
2248213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
2249213423ffSJunchao Zhang           tmp                        = 0;
2250213423ffSJunchao Zhang         }
22519566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));
2252aa372e3fSPaul Mullowney 
2253aa372e3fSPaul Mullowney         /* assign the pointer */
2254aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
2255d71ae5a4SJacob Faibussowitsch       } catch (char *ex) {
2256d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
2257d71ae5a4SJacob Faibussowitsch       }
22589566063dSJacob Faibussowitsch       PetscCallCUDA(WaitForCUDA());
22599566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
226034d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
226134d6c7a5SJose E. Roman     }
2262abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
22639ae82921SPaul Mullowney   }
22649ae82921SPaul Mullowney   PetscFunctionReturn(0);
22659ae82921SPaul Mullowney }
22669ae82921SPaul Mullowney 
22679371c9d4SSatish Balay struct VecCUDAPlusEquals {
2268aa372e3fSPaul Mullowney   template <typename Tuple>
2269d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2270d71ae5a4SJacob Faibussowitsch   {
2271aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2272aa372e3fSPaul Mullowney   }
2273aa372e3fSPaul Mullowney };
2274aa372e3fSPaul Mullowney 
22759371c9d4SSatish Balay struct VecCUDAEquals {
22767e8381f9SStefano Zampini   template <typename Tuple>
2277d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2278d71ae5a4SJacob Faibussowitsch   {
22797e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
22807e8381f9SStefano Zampini   }
22817e8381f9SStefano Zampini };
22827e8381f9SStefano Zampini 
22839371c9d4SSatish Balay struct VecCUDAEqualsReverse {
2284e6e9a74fSStefano Zampini   template <typename Tuple>
2285d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2286d71ae5a4SJacob Faibussowitsch   {
2287e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
2288e6e9a74fSStefano Zampini   }
2289e6e9a74fSStefano Zampini };
2290e6e9a74fSStefano Zampini 
2291afb2bd1cSJunchao Zhang struct MatMatCusparse {
2292ccdfe979SStefano Zampini   PetscBool      cisdense;
2293ccdfe979SStefano Zampini   PetscScalar   *Bt;
2294ccdfe979SStefano Zampini   Mat            X;
2295fcdce8c4SStefano Zampini   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2296fcdce8c4SStefano Zampini   PetscLogDouble flops;
2297fcdce8c4SStefano Zampini   CsrMatrix     *Bcsr;
2298b4285af6SJunchao Zhang 
2299afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2300fcdce8c4SStefano Zampini   cusparseSpMatDescr_t matSpBDescr;
2301afb2bd1cSJunchao Zhang   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2302afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
2303afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
2304afb2bd1cSJunchao Zhang   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2305b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2306b4285af6SJunchao Zhang   void *dBuffer4;
2307b4285af6SJunchao Zhang   void *dBuffer5;
2308b4285af6SJunchao Zhang   #endif
2309fcdce8c4SStefano Zampini   size_t                mmBufferSize;
2310fcdce8c4SStefano Zampini   void                 *mmBuffer;
2311fcdce8c4SStefano Zampini   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2312fcdce8c4SStefano Zampini   cusparseSpGEMMDescr_t spgemmDesc;
2313afb2bd1cSJunchao Zhang #endif
2314afb2bd1cSJunchao Zhang };
2315ccdfe979SStefano Zampini 
2316d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2317d71ae5a4SJacob Faibussowitsch {
2318ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2319ccdfe979SStefano Zampini 
2320ccdfe979SStefano Zampini   PetscFunctionBegin;
23219566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(mmdata->Bt));
2322fcdce8c4SStefano Zampini   delete mmdata->Bcsr;
2323afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
23249566063dSJacob Faibussowitsch   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
23259566063dSJacob Faibussowitsch   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
23269566063dSJacob Faibussowitsch   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
23279566063dSJacob Faibussowitsch   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2328b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
23299566063dSJacob Faibussowitsch   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
23309566063dSJacob Faibussowitsch   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2331b4285af6SJunchao Zhang   #endif
23329566063dSJacob Faibussowitsch   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
23339566063dSJacob Faibussowitsch   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2334afb2bd1cSJunchao Zhang #endif
23359566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&mmdata->X));
23369566063dSJacob Faibussowitsch   PetscCall(PetscFree(data));
2337ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2338ccdfe979SStefano Zampini }
2339ccdfe979SStefano Zampini 
2340ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2341ccdfe979SStefano Zampini 
/* Numeric phase of the sparse-times-dense products with a SeqAIJCUSPARSE A and a (SEQDENSE or
   SEQDENSECUDA) B: handles MATPRODUCT_AB, AtB, ABt, PtAP and RARt.  The sparse multiply is done
   with cusparseSpMM (CUDA >= 11) or cusparse_csr_spmm (older CUDA); for PtAP/RARt the sparse
   product is first written to the intermediate dense matrix mmdata->X (allocated in the symbolic
   phase) and then multiplied against B with a dense-dense kernel.  Requires the product data
   (MatMatCusparse) created by MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda; /* op(A) is m x ?, result is m x n; blda/clda are leading dims of B and of the output */
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* select which CSR struct (A or its explicit transpose) and which cuSPARSE op to use,
     and the dimensions m x n of the sparse product op(A)*op(B) */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP: first compute X = A*P here, the P^T*X dense product comes at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cuSPARSE transpose A implicitly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* user asked for an explicitly stored transpose: build/reuse it */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt: first compute X = A*R^T here, the R*X dense product comes at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* sparse product goes into the intermediate X, not directly into C */
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a changed leading dimension invalidates the cached dense descriptor: rebuild it */
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    /* grow (never shrink) the cached workspace for cusparseSpMM */
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the mmdata->Bt buffer allocated in the symbolic phase */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  /* finish PtAP/RARt with a dense-dense product of B against the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* undo the temporary host->device conversions done above, so the caller sees the original types */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2499ccdfe979SStefano Zampini 
/* Symbolic phase for products of a SeqAIJCUSPARSE matrix A with a dense matrix B
   (MATPRODUCT_AB/AtB/ABt/PtAP/RARt): sets the sizes of C, switches C to the CUDA
   dense type, and allocates the MatMatCusparse product data (including the
   intermediate matrix X needed by PtAP/RARt) consumed by the numeric phase. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            crows, ccols; /* dimensions of the product C */
  PetscBool           cwasdense, isaijcuda;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isaijcuda));
  PetscCheck(isaijcuda, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions of C for each supported product type */
  if (product->type == MATPRODUCT_AB) {
    crows = A->rmap->n;
    ccols = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    crows = A->cmap->n;
    ccols = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    crows = A->rmap->n;
    ccols = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    crows = B->cmap->n;
    ccols = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    crows = B->rmap->n;
    ccols = B->rmap->n;
  } else SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  PetscCall(MatSetSizes(C, crows, ccols, crows, ccols));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cwasdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data: consumed by the numeric phase, freed by the destroy callback */
  PetscCall(PetscNew(&mm));
  mm->cisdense = cwasdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    const PetscInt xcols = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    PetscCall(MatSetSizes(mm->X, A->rmap->n, xcols, A->rmap->n, xcols));
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2570ccdfe979SStefano Zampini 
/* Numeric phase of the sparse-sparse products C = A*B, A^T*B, A*B^T with all operands of type
   SeqAIJCUSPARSE.  Uses cusparseSpGEMMreuse_compute/cusparseSpGEMM_compute (CUDA >= 11) or
   cusparse_csr_spgemm (older CUDA) on the descriptors and workspace stored in the MatMatCusparse
   product data created by the symbolic phase.  Transposed operands are realized through the
   explicit transpose structs (matTranspose) built in the symbolic phase, since the spgemm
   kernels are invoked with non-transpose ops on both sides. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values already on the GPU; only the bookkeeping below is needed */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* symmetric operands let us fall back to plain AB; symbolic must have done the same */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* pick the CSR structs; transposed operands use the explicitly formed transpose */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure was fixed in the symbolic phase; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* the up-to-date values now live on the device */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2691fcdce8c4SStefano Zampini 
2692d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2693d71ae5a4SJacob Faibussowitsch {
2694fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2695fcdce8c4SStefano Zampini   Mat                           A, B;
2696fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2697fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2698fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2699fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2700fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
2701fcdce8c4SStefano Zampini   PetscBool                     flg;
2702fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2703fcdce8c4SStefano Zampini   MatProductType                ptype;
2704fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2705fcdce8c4SStefano Zampini   PetscLogDouble                flops;
2706fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
2707fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2708fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2709fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2710fcdce8c4SStefano Zampini #else
2711fcdce8c4SStefano Zampini   int cnz;
2712fcdce8c4SStefano Zampini #endif
2713b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2714fcdce8c4SStefano Zampini 
2715fcdce8c4SStefano Zampini   PetscFunctionBegin;
2716fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
271728b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2718fcdce8c4SStefano Zampini   A = product->A;
2719fcdce8c4SStefano Zampini   B = product->B;
27209566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
272128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
27229566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
272328b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2724fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
2725fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
2726fcdce8c4SStefano Zampini   /* product data */
27279566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2728fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2729fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2730fcdce8c4SStefano Zampini 
27319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
27329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2733d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2734d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
273508401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
273608401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2737d60bce21SJunchao Zhang 
2738fcdce8c4SStefano Zampini   ptype = product->type;
2739b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2740fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2741fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2742fa046f9fSJunchao Zhang   }
2743b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2744fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2745fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2746fa046f9fSJunchao Zhang   }
2747fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2748fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2749fcdce8c4SStefano Zampini   switch (ptype) {
2750fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2751fcdce8c4SStefano Zampini     m    = A->rmap->n;
2752fcdce8c4SStefano Zampini     n    = B->cmap->n;
2753fcdce8c4SStefano Zampini     k    = A->cmap->n;
2754fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2755fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2756fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2757fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2758fcdce8c4SStefano Zampini     break;
2759fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2760fcdce8c4SStefano Zampini     m = A->cmap->n;
2761fcdce8c4SStefano Zampini     n = B->cmap->n;
2762fcdce8c4SStefano Zampini     k = A->rmap->n;
27639566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2764fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2765fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2766fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2767fcdce8c4SStefano Zampini     break;
2768fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2769fcdce8c4SStefano Zampini     m = A->rmap->n;
2770fcdce8c4SStefano Zampini     n = B->rmap->n;
2771fcdce8c4SStefano Zampini     k = A->cmap->n;
27729566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2773fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2774fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2775fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2776fcdce8c4SStefano Zampini     break;
2777d71ae5a4SJacob Faibussowitsch   default:
2778d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2779fcdce8c4SStefano Zampini   }
2780fcdce8c4SStefano Zampini 
2781fcdce8c4SStefano Zampini   /* create cusparse matrix */
27829566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
27839566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2784fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
2785fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2786fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2787fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2788fcdce8c4SStefano Zampini 
2789fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2790fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
2791fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
27929566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
27939566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2794fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2795fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2796fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2797fcdce8c4SStefano Zampini   } else {
2798fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2799fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2800fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2801fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2802fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2803fcdce8c4SStefano Zampini   }
2804fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2805fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
2806fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
2807fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2808fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2809fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
28109566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
28119566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
28129566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
28139566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
28149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
28159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
28169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28179566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2819fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2820fcdce8c4SStefano Zampini     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2821fcdce8c4SStefano Zampini     c->nz                = 0;
2822fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2823fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
2824fcdce8c4SStefano Zampini     goto finalizesym;
2825fcdce8c4SStefano Zampini   }
2826fcdce8c4SStefano Zampini 
282728b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
282828b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2829fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2830fcdce8c4SStefano Zampini   if (!biscompressed) {
2831fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
2832fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2833fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2834fcdce8c4SStefano Zampini #endif
2835fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2836fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2837fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
2838fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2839fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2840fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2841fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2842fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2843fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2844fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2845fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
28469566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2847fcdce8c4SStefano Zampini     }
2848fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2849fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
2850fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2851fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
28529371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28539371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2854fcdce8c4SStefano Zampini     }
2855fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2856fcdce8c4SStefano Zampini #endif
2857fcdce8c4SStefano Zampini   }
285828b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
285928b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2860fcdce8c4SStefano Zampini   /* precompute flops count */
2861fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2862fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2863fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2864fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
2865fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
2866fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2867fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2868fcdce8c4SStefano Zampini       }
2869fcdce8c4SStefano Zampini     }
2870fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2871fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2872fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
2873fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2874fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
2875fcdce8c4SStefano Zampini     }
2876fcdce8c4SStefano Zampini   } else { /* TODO */
2877fcdce8c4SStefano Zampini     flops = 0.;
2878fcdce8c4SStefano Zampini   }
2879fcdce8c4SStefano Zampini 
2880fcdce8c4SStefano Zampini   mmdata->flops = flops;
28819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2882b4285af6SJunchao Zhang 
2883fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28849566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
28859371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28869371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
28879566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2888b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2889b4285af6SJunchao Zhang   {
2890b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2891b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2892b4285af6SJunchao Zhang   */
2893b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
2894b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
2895b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
2896b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2897b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
2898b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
2899b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
2900b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
2901b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
2902b4285af6SJunchao Zhang 
2903b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2904b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
29059371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
29069371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29079566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2908b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
29099371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
29109371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2911b4285af6SJunchao Zhang 
2912b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29139371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
29149371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
29169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
29179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
29189371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
29199371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29209566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
29219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
2922b4285af6SJunchao Zhang 
2923b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2924b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
29259566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2926b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
2927b4285af6SJunchao Zhang     /* allocate matrix C */
29289371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29299371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29309371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
29319371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2932b4285af6SJunchao Zhang     /* update matC with the new pointers */
29339371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29349371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2935b4285af6SJunchao Zhang 
2936b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29379371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
29389371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29399566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
29409371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
29419371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
29439371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29449371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29459566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2946b4285af6SJunchao Zhang   }
2947ae37ee31SJunchao Zhang   #else
2948b4285af6SJunchao Zhang   size_t bufSize2;
2949fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
29509371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
29519371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29529566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2953fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
29549371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
29559371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2956fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
29579371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
29589371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2959fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2960fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2961fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2962fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2963fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
29649566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2965fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
29669371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29679371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2968fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
29699566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2970fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
29719371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
29729371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
2973fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29749566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2975fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
29769566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29779371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29789371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29799371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29809371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2981ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2982fcdce8c4SStefano Zampini #else
29839566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
29849371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29859371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
29869371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2987fcdce8c4SStefano Zampini   c->nz = cnz;
2988fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29899566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2990fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
29919566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2992fcdce8c4SStefano Zampini 
29939566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2994fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2995fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2996fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
29979371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29989371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
29999371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3000fcdce8c4SStefano Zampini #endif
30019566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3003fcdce8c4SStefano Zampini finalizesym:
3004fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3005fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3006fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
30079566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
30089566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3009fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3010fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3011fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3012fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3013fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3014fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3015fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3018fcdce8c4SStefano Zampini   } else {
3019fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3020fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30229566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3023fcdce8c4SStefano Zampini   }
3024fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3025fcdce8c4SStefano Zampini     PetscInt r = 0;
3026fcdce8c4SStefano Zampini     c->i[0]    = 0;
3027fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3028fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3029fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3030fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3031fcdce8c4SStefano Zampini     }
3032fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3033fcdce8c4SStefano Zampini   }
30349566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
30359566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
30369566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3037fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3038fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3039fcdce8c4SStefano Zampini   c->rmax          = 0;
3040fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3041fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3042fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3043fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3044fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3045fcdce8c4SStefano Zampini   }
30469566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
30479566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3048fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3049fcdce8c4SStefano Zampini 
3050fcdce8c4SStefano Zampini   C->nonzerostate++;
30519566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
30529566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3053fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3054fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3055fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3056fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3057fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3058abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3059fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3060fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3061fcdce8c4SStefano Zampini   }
3062fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3063fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3064fcdce8c4SStefano Zampini }
3065fcdce8c4SStefano Zampini 
3066fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3067fcdce8c4SStefano Zampini 
3068fcdce8c4SStefano Zampini /* handles sparse or dense B */
3069d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3070d71ae5a4SJacob Faibussowitsch {
3071fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3072fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3073fcdce8c4SStefano Zampini 
3074fcdce8c4SStefano Zampini   PetscFunctionBegin;
3075fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
30769566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
307748a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3078fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3079fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
308048a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3081fcdce8c4SStefano Zampini   }
308265e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
308365e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
308465e4b4d4SStefano Zampini     switch (product->type) {
308565e4b4d4SStefano Zampini     case MATPRODUCT_AB:
308665e4b4d4SStefano Zampini       if (product->api_user) {
3087d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
30889566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3089d0609cedSBarry Smith         PetscOptionsEnd();
309065e4b4d4SStefano Zampini       } else {
3091d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
30929566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3093d0609cedSBarry Smith         PetscOptionsEnd();
309465e4b4d4SStefano Zampini       }
309565e4b4d4SStefano Zampini       break;
309665e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
309765e4b4d4SStefano Zampini       if (product->api_user) {
3098d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
30999566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3100d0609cedSBarry Smith         PetscOptionsEnd();
310165e4b4d4SStefano Zampini       } else {
3102d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
31039566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3104d0609cedSBarry Smith         PetscOptionsEnd();
310565e4b4d4SStefano Zampini       }
310665e4b4d4SStefano Zampini       break;
310765e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
310865e4b4d4SStefano Zampini       if (product->api_user) {
3109d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
31109566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3111d0609cedSBarry Smith         PetscOptionsEnd();
311265e4b4d4SStefano Zampini       } else {
3113d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
31149566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3115d0609cedSBarry Smith         PetscOptionsEnd();
311665e4b4d4SStefano Zampini       }
311765e4b4d4SStefano Zampini       break;
311865e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
311965e4b4d4SStefano Zampini       if (product->api_user) {
3120d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
31219566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3122d0609cedSBarry Smith         PetscOptionsEnd();
312365e4b4d4SStefano Zampini       } else {
3124d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
31259566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3126d0609cedSBarry Smith         PetscOptionsEnd();
312765e4b4d4SStefano Zampini       }
312865e4b4d4SStefano Zampini       break;
312965e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
313065e4b4d4SStefano Zampini       if (product->api_user) {
3131d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
31329566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3133d0609cedSBarry Smith         PetscOptionsEnd();
313465e4b4d4SStefano Zampini       } else {
3135d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
31369566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3137d0609cedSBarry Smith         PetscOptionsEnd();
313865e4b4d4SStefano Zampini       }
313965e4b4d4SStefano Zampini       break;
3140d71ae5a4SJacob Faibussowitsch     default:
3141d71ae5a4SJacob Faibussowitsch       break;
314265e4b4d4SStefano Zampini     }
314365e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
314465e4b4d4SStefano Zampini   }
314565e4b4d4SStefano Zampini   /* dispatch */
3146fcdce8c4SStefano Zampini   if (isdense) {
3147ccdfe979SStefano Zampini     switch (product->type) {
3148ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3149ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3150ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3151ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3152ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3153fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
31549566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3155fcdce8c4SStefano Zampini       } else {
3156fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3157fcdce8c4SStefano Zampini       }
3158fcdce8c4SStefano Zampini       break;
3159d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3160d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3161d71ae5a4SJacob Faibussowitsch       break;
3162d71ae5a4SJacob Faibussowitsch     default:
3163d71ae5a4SJacob Faibussowitsch       break;
3164ccdfe979SStefano Zampini     }
3165fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3166fcdce8c4SStefano Zampini     switch (product->type) {
3167fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3168fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3169d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3170d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3171d71ae5a4SJacob Faibussowitsch       break;
3172fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3173fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3174d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3175d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3176d71ae5a4SJacob Faibussowitsch       break;
3177d71ae5a4SJacob Faibussowitsch     default:
3178d71ae5a4SJacob Faibussowitsch       break;
3179fcdce8c4SStefano Zampini     }
3180fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
31819566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3182fcdce8c4SStefano Zampini   }
3183ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3184ccdfe979SStefano Zampini }
3185ccdfe979SStefano Zampini 
3186d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3187d71ae5a4SJacob Faibussowitsch {
31889ae82921SPaul Mullowney   PetscFunctionBegin;
31899566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3190e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3191e6e9a74fSStefano Zampini }
3192e6e9a74fSStefano Zampini 
3193d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3194d71ae5a4SJacob Faibussowitsch {
3195e6e9a74fSStefano Zampini   PetscFunctionBegin;
31969566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3197e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3198e6e9a74fSStefano Zampini }
3199e6e9a74fSStefano Zampini 
3200d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3201d71ae5a4SJacob Faibussowitsch {
3202e6e9a74fSStefano Zampini   PetscFunctionBegin;
32039566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3204e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3205e6e9a74fSStefano Zampini }
3206e6e9a74fSStefano Zampini 
3207d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3208d71ae5a4SJacob Faibussowitsch {
3209e6e9a74fSStefano Zampini   PetscFunctionBegin;
32109566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
32119ae82921SPaul Mullowney   PetscFunctionReturn(0);
32129ae82921SPaul Mullowney }
32139ae82921SPaul Mullowney 
3214d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3215d71ae5a4SJacob Faibussowitsch {
3216ca45077fSPaul Mullowney   PetscFunctionBegin;
32179566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3218ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3219ca45077fSPaul Mullowney }
3220ca45077fSPaul Mullowney 
3221d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3222d71ae5a4SJacob Faibussowitsch {
3223a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3224a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3225a0e72f99SJunchao Zhang }
3226a0e72f99SJunchao Zhang 
3227afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3228d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3229d71ae5a4SJacob Faibussowitsch {
32309ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3231aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
32329ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3233e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3234e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3235e6e9a74fSStefano Zampini   PetscBool                     compressed;
3236afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3237afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3238afb2bd1cSJunchao Zhang #endif
32396e111a19SKarl Rupp 
32409ae82921SPaul Mullowney   PetscFunctionBegin;
324108401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3242cbc6b225SStefano Zampini   if (!a->nz) {
32436d54fb17SJacob Faibussowitsch     if (yy) PetscCall(VecSeq_CUDA::copy(yy, zz));
32446d54fb17SJacob Faibussowitsch     else PetscCall(VecSeq_CUDA::set(zz, 0));
3245e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
3246e6e9a74fSStefano Zampini   }
324734d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
32489566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3249e6e9a74fSStefano Zampini   if (!trans) {
32509ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
32515f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3252e6e9a74fSStefano Zampini   } else {
32531a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3254e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3255e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3256e6e9a74fSStefano Zampini     } else {
32579566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3258e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3259e6e9a74fSStefano Zampini     }
3260e6e9a74fSStefano Zampini   }
3261e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3262e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3263213423ffSJunchao Zhang 
3264e6e9a74fSStefano Zampini   try {
32659566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
32669566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
32679566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3268afb2bd1cSJunchao Zhang 
32699566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3270e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3271afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3272afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3273afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3274afb2bd1cSJunchao Zhang       */
3275e6e9a74fSStefano Zampini       xptr = xarray;
3276afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3277213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3278afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3279afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3280afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3281afb2bd1cSJunchao Zhang        */
3282afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3283afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3284afb2bd1cSJunchao Zhang         nx             = mat->num_cols;
3285afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3286afb2bd1cSJunchao Zhang       }
3287afb2bd1cSJunchao Zhang #endif
3288e6e9a74fSStefano Zampini     } else {
3289afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3290afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3291afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3292afb2bd1cSJunchao Zhang        */
3293afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3294e6e9a74fSStefano Zampini       dptr = zarray;
3295e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3296afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3297e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3298d0967f54SJacob Faibussowitsch 
3299d0967f54SJacob Faibussowitsch         thrust::for_each(
3300d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3301d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3302d0967f54SJacob Faibussowitsch #endif
3303d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
33049371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3305e6e9a74fSStefano Zampini       }
3306afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3307afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3308afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3309afb2bd1cSJunchao Zhang         nx             = mat->num_rows;
3310afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3311afb2bd1cSJunchao Zhang       }
3312afb2bd1cSJunchao Zhang #endif
3313e6e9a74fSStefano Zampini     }
33149ae82921SPaul Mullowney 
3315afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3316aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3317afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
33185f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3319afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
33209566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
33219566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
33229371c9d4SSatish Balay         PetscCallCUSPARSE(
33239371c9d4SSatish Balay           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
33249566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3325afb2bd1cSJunchao Zhang 
3326afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3327afb2bd1cSJunchao Zhang       } else {
3328afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
33299566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
33309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3331afb2bd1cSJunchao Zhang       }
3332afb2bd1cSJunchao Zhang 
33339371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
33349371c9d4SSatish Balay                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3335afb2bd1cSJunchao Zhang #else
33367656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
33379371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3338afb2bd1cSJunchao Zhang #endif
3339aa372e3fSPaul Mullowney     } else {
3340213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3341afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3342afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3343afb2bd1cSJunchao Zhang #else
3344301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
33459371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3346afb2bd1cSJunchao Zhang #endif
3347a65300a6SPaul Mullowney       }
3348aa372e3fSPaul Mullowney     }
33499566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3350aa372e3fSPaul Mullowney 
3351e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3352213423ffSJunchao Zhang       if (yy) {                                      /* MatMultAdd: zz = A*xx + yy */
3353213423ffSJunchao Zhang         if (compressed) {                            /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
33546d54fb17SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::copy(yy, zz));      /* zz = yy */
3355e6e9a74fSStefano Zampini         } else if (zz != yy) {                       /* A is not compressed. zz already contains A*xx, and we just need to add yy */
33566d54fb17SJacob Faibussowitsch           PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
33577656d835SStefano Zampini         }
3358213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
33596d54fb17SJacob Faibussowitsch         PetscCall(VecSeq_CUDA::set(zz, 0));
33607656d835SStefano Zampini       }
33617656d835SStefano Zampini 
3362213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3363213423ffSJunchao Zhang       if (compressed) {
33649566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3365a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3366a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3367a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3368a0e72f99SJunchao Zhang          */
3369a0e72f99SJunchao Zhang #if 0
3370a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3371a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3372a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3373e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3374c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3375a0e72f99SJunchao Zhang #else
3376a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3377a0e72f99SJunchao Zhang         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3378a0e72f99SJunchao Zhang #endif
33799566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3380e6e9a74fSStefano Zampini       }
3381e6e9a74fSStefano Zampini     } else {
33826d54fb17SJacob Faibussowitsch       if (yy && yy != zz) PetscCall(VecSeq_CUDA::axpy(zz, 1.0, yy)); /* zz += yy */
3383e6e9a74fSStefano Zampini     }
33849566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
33859566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
33869566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3387d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3388d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3389d71ae5a4SJacob Faibussowitsch   }
3390e6e9a74fSStefano Zampini   if (yy) {
33919566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3392e6e9a74fSStefano Zampini   } else {
33939566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3394e6e9a74fSStefano Zampini   }
33959ae82921SPaul Mullowney   PetscFunctionReturn(0);
33969ae82921SPaul Mullowney }
33979ae82921SPaul Mullowney 
3398d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3399d71ae5a4SJacob Faibussowitsch {
3400ca45077fSPaul Mullowney   PetscFunctionBegin;
34019566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3402ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3403ca45077fSPaul Mullowney }
3404ca45077fSPaul Mullowney 
3405d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3406d71ae5a4SJacob Faibussowitsch {
3407042217e8SBarry Smith   PetscObjectState    onnz = A->nonzerostate;
3408042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
34093fa6b06aSMark Adams 
3410042217e8SBarry Smith   PetscFunctionBegin;
34119566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3412042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
34139566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
34149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3415042217e8SBarry Smith     cusp->deviceMat = NULL;
3416042217e8SBarry Smith   }
34179ae82921SPaul Mullowney   PetscFunctionReturn(0);
34189ae82921SPaul Mullowney }
34199ae82921SPaul Mullowney 
34209ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3421e057df02SPaul Mullowney /*@
342211a5261eSBarry Smith    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3423e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
342411a5261eSBarry Smith    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3425e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3426e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3427e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
34289ae82921SPaul Mullowney 
3429d083f849SBarry Smith    Collective
34309ae82921SPaul Mullowney 
34319ae82921SPaul Mullowney    Input Parameters:
343211a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
34339ae82921SPaul Mullowney .  m - number of rows
34349ae82921SPaul Mullowney .  n - number of columns
34359ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
34369ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
34370298fd71SBarry Smith          (possibly different for each row) or NULL
34389ae82921SPaul Mullowney 
34399ae82921SPaul Mullowney    Output Parameter:
34409ae82921SPaul Mullowney .  A - the matrix
34419ae82921SPaul Mullowney 
344211a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
34439ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
344411a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
34459ae82921SPaul Mullowney 
34469ae82921SPaul Mullowney    Notes:
34479ae82921SPaul Mullowney    If nnz is given then nz is ignored
34489ae82921SPaul Mullowney 
344911a5261eSBarry Smith    The AIJ format, also called
345011a5261eSBarry Smith    compressed row storage, is fully compatible with standard Fortran 77
34519ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
34529ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
34539ae82921SPaul Mullowney 
34549ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
345511a5261eSBarry Smith    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
34569ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
34579ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
34589ae82921SPaul Mullowney 
34599ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
34609ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
34619ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
34629ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
34639ae82921SPaul Mullowney 
34649ae82921SPaul Mullowney    Level: intermediate
34659ae82921SPaul Mullowney 
346611a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
34679ae82921SPaul Mullowney @*/
3468d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3469d71ae5a4SJacob Faibussowitsch {
34709ae82921SPaul Mullowney   PetscFunctionBegin;
34719566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
34729566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
34739566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
34749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
34759ae82921SPaul Mullowney   PetscFunctionReturn(0);
34769ae82921SPaul Mullowney }
34779ae82921SPaul Mullowney 
3478d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3479d71ae5a4SJacob Faibussowitsch {
34809ae82921SPaul Mullowney   PetscFunctionBegin;
34819ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
34829566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
34839ae82921SPaul Mullowney   } else {
34849566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3485aa372e3fSPaul Mullowney   }
34869566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
34879566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
34889566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
34899566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
34909566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
34919566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
34929566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
34939566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
34949566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
34959566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
34969566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
34979ae82921SPaul Mullowney   PetscFunctionReturn(0);
34989ae82921SPaul Mullowney }
34999ae82921SPaul Mullowney 
3500ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
350195639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3502d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3503d71ae5a4SJacob Faibussowitsch {
35049ff858a8SKarl Rupp   PetscFunctionBegin;
35059566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
35069566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
35079ff858a8SKarl Rupp   PetscFunctionReturn(0);
35089ff858a8SKarl Rupp }
35099ff858a8SKarl Rupp 
3510d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3511d71ae5a4SJacob Faibussowitsch {
3512a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3513039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3514039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3515039c6fbaSStefano Zampini   PetscScalar        *ay;
3516039c6fbaSStefano Zampini   const PetscScalar  *ax;
3517039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3518e6e9a74fSStefano Zampini 
351995639643SRichard Tran Mills   PetscFunctionBegin;
3520a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3521a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3522039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
35239566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
35249566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3525a587d139SMark     PetscFunctionReturn(0);
352695639643SRichard Tran Mills   }
3527039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
35289566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
35299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
35305f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
35315f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3532039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3533039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3534039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3535039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3536039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3537ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3538039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3539039c6fbaSStefano Zampini   }
3540d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3541d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3542039c6fbaSStefano Zampini 
3543039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3544039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3545039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3546039c6fbaSStefano Zampini     size_t bufferSize;
3547039c6fbaSStefano Zampini     void  *buffer;
3548039c6fbaSStefano Zampini #endif
3549039c6fbaSStefano Zampini 
35509566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
35519566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
35529566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3553039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
35549371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35559371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
35569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
35579566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35589371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35599371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
35609566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
35619566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
35629566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3563039c6fbaSStefano Zampini #else
35649566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35659371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35669371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
35679566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
35689566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3569039c6fbaSStefano Zampini #endif
35709566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
35719566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
35729566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
35739566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3574039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3575a587d139SMark     cublasHandle_t cublasv2handle;
3576a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3577039c6fbaSStefano Zampini 
35789566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
35799566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
35809566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
35819566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
35829566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35839566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
35849566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
35859566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
35869566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
35879566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
35889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3589039c6fbaSStefano Zampini   } else {
35909566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
35919566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3592a587d139SMark   }
359395639643SRichard Tran Mills   PetscFunctionReturn(0);
359495639643SRichard Tran Mills }
359595639643SRichard Tran Mills 
3596d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
3597d71ae5a4SJacob Faibussowitsch {
359833c9ba73SStefano Zampini   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
359933c9ba73SStefano Zampini   PetscScalar   *ay;
360033c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
360133c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
360233c9ba73SStefano Zampini 
360333c9ba73SStefano Zampini   PetscFunctionBegin;
36049566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
36059566063dSJacob Faibussowitsch   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
36069566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(y->nz, &bnz));
36079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
36089566063dSJacob Faibussowitsch   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
36099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(bnz));
36109566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
36119566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
36129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
361333c9ba73SStefano Zampini   PetscFunctionReturn(0);
361433c9ba73SStefano Zampini }
361533c9ba73SStefano Zampini 
3616d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
3617d71ae5a4SJacob Faibussowitsch {
36187e8381f9SStefano Zampini   PetscBool   both = PETSC_FALSE;
3619a587d139SMark   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
36207e8381f9SStefano Zampini 
36213fa6b06aSMark Adams   PetscFunctionBegin;
36223fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
36233fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
36247e8381f9SStefano Zampini     if (spptr->mat) {
36257e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
36267e8381f9SStefano Zampini       if (matrix->values) {
36277e8381f9SStefano Zampini         both = PETSC_TRUE;
36287e8381f9SStefano Zampini         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
36297e8381f9SStefano Zampini       }
36307e8381f9SStefano Zampini     }
36317e8381f9SStefano Zampini     if (spptr->matTranspose) {
36327e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
3633ad540459SPierre Jolivet       if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
36347e8381f9SStefano Zampini     }
36353fa6b06aSMark Adams   }
36369566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
36379566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
36387e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3639a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
36403fa6b06aSMark Adams   PetscFunctionReturn(0);
36413fa6b06aSMark Adams }
36423fa6b06aSMark Adams 
3643d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
3644d71ae5a4SJacob Faibussowitsch {
3645a587d139SMark   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3646a587d139SMark 
3647a587d139SMark   PetscFunctionBegin;
36489a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
36499a14fc28SStefano Zampini     A->boundtocpu = flg;
36509a14fc28SStefano Zampini     PetscFunctionReturn(0);
36519a14fc28SStefano Zampini   }
3652a587d139SMark   if (flg) {
36539566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3654a587d139SMark 
365533c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3656a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3657a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3658a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3659a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3660a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3661a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3662a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3663a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3664fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
36659566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
36669566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
36679566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
36689566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
36699566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
36709566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
36719566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3672a587d139SMark   } else {
367333c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3674a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3675a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3676a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3677a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3678a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3679a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3680a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3681a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3682fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
368367a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
368467a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
368567a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
368667a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
368767a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
368867a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
36897ee59b9bSJunchao Zhang     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
36907ee59b9bSJunchao Zhang 
36919566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
36929566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
36939566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
36949566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
36959566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
36969566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3697a587d139SMark   }
3698a587d139SMark   A->boundtocpu = flg;
3699ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
3700ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
3701ea500dcfSRichard Tran Mills   } else {
3702ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
3703ea500dcfSRichard Tran Mills   }
3704a587d139SMark   PetscFunctionReturn(0);
3705a587d139SMark }
3706a587d139SMark 
3707*8eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat)
3708d71ae5a4SJacob Faibussowitsch {
370949735bf3SStefano Zampini   Mat B;
37109ae82921SPaul Mullowney 
37119ae82921SPaul Mullowney   PetscFunctionBegin;
37129566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
371349735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
37149566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
371549735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
37169566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
371749735bf3SStefano Zampini   }
371849735bf3SStefano Zampini   B = *newmat;
371949735bf3SStefano Zampini 
37209566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
37219566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
372234136279SStefano Zampini 
372349735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
37249ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3725e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
37269566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
37279566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
37289566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
37291a2c6b5cSJunchao Zhang       spptr->format = MAT_CUSPARSE_CSR;
3730d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3731ba986b86SSatish Balay   #if CUSPARSE_VERSION > 11301
3732a435da06SStefano Zampini       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3733a435da06SStefano Zampini   #else
3734d8132acaSStefano Zampini       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3735a435da06SStefano Zampini   #endif
3736d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3737d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3738d8132acaSStefano Zampini #endif
37391a2c6b5cSJunchao Zhang       B->spptr = spptr;
37409ae82921SPaul Mullowney     } else {
3741e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3742e6e9a74fSStefano Zampini 
37439566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
37449566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
37459566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3746e6e9a74fSStefano Zampini       B->spptr = spptr;
37479ae82921SPaul Mullowney     }
3748e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
374949735bf3SStefano Zampini   }
3750693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
37519ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
37521a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
37539ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
375495639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3755693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
37562205254eSKarl Rupp 
37579566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
37589566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
37599566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
3760ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
37619566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
3762ae48a8d0SStefano Zampini #endif
37639566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
37649ae82921SPaul Mullowney   PetscFunctionReturn(0);
37659ae82921SPaul Mullowney }
37669ae82921SPaul Mullowney 
3767d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
3768d71ae5a4SJacob Faibussowitsch {
376902fe1965SBarry Smith   PetscFunctionBegin;
37709566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
37719566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
377202fe1965SBarry Smith   PetscFunctionReturn(0);
377302fe1965SBarry Smith }
377402fe1965SBarry Smith 
37753ca39a21SBarry Smith /*MC
3776e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3777e057df02SPaul Mullowney 
377811a5261eSBarry Smith    A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either
377911a5261eSBarry Smith    CSR, ELL, or Hybrid format.
378011a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3781e057df02SPaul Mullowney 
3782e057df02SPaul Mullowney    Options Database Keys:
378311a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
378411a5261eSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
378511a5261eSBarry Smith -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
378611a5261eSBarry Smith +  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3787e057df02SPaul Mullowney 
3788e057df02SPaul Mullowney   Level: beginner
3789e057df02SPaul Mullowney 
379011a5261eSBarry Smith .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3791e057df02SPaul Mullowney M*/
37927f756511SDominic Meiser 
3793bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
37940f39cd5aSBarry Smith 
3795d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
3796d71ae5a4SJacob Faibussowitsch {
379742c9c57cSBarry Smith   PetscFunctionBegin;
37989566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
37999566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
38009566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
38019566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
38029566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
3803bddcd29dSMark Adams 
380442c9c57cSBarry Smith   PetscFunctionReturn(0);
380542c9c57cSBarry Smith }
380629b38603SBarry Smith 
3807d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3808d71ae5a4SJacob Faibussowitsch {
3809cbc6b225SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;
3810cbc6b225SStefano Zampini 
3811cbc6b225SStefano Zampini   PetscFunctionBegin;
3812cbc6b225SStefano Zampini   if (!cusp) PetscFunctionReturn(0);
3813cbc6b225SStefano Zampini   delete cusp->cooPerm;
3814cbc6b225SStefano Zampini   delete cusp->cooPerm_a;
3815cbc6b225SStefano Zampini   cusp->cooPerm   = NULL;
3816cbc6b225SStefano Zampini   cusp->cooPerm_a = NULL;
3817cbc6b225SStefano Zampini   if (cusp->use_extended_coo) {
38189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->jmap_d));
38199566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->perm_d));
3820cbc6b225SStefano Zampini   }
3821cbc6b225SStefano Zampini   cusp->use_extended_coo = PETSC_FALSE;
3822cbc6b225SStefano Zampini   PetscFunctionReturn(0);
3823cbc6b225SStefano Zampini }
3824cbc6b225SStefano Zampini 
3825d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
3826d71ae5a4SJacob Faibussowitsch {
38277f756511SDominic Meiser   PetscFunctionBegin;
38287f756511SDominic Meiser   if (*cusparsestruct) {
38299566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
38309566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
38317f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
383281902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
38337e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
38347e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
3835a49f1ed0SStefano Zampini     delete (*cusparsestruct)->csr2csc_i;
38369566063dSJacob Faibussowitsch     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
38379566063dSJacob Faibussowitsch     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
38389566063dSJacob Faibussowitsch     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
38399566063dSJacob Faibussowitsch     PetscCall(PetscFree(*cusparsestruct));
38407f756511SDominic Meiser   }
38417f756511SDominic Meiser   PetscFunctionReturn(0);
38427f756511SDominic Meiser }
38437f756511SDominic Meiser 
3844d71ae5a4SJacob Faibussowitsch static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
3845d71ae5a4SJacob Faibussowitsch {
38467f756511SDominic Meiser   PetscFunctionBegin;
38477f756511SDominic Meiser   if (*mat) {
38487f756511SDominic Meiser     delete (*mat)->values;
38497f756511SDominic Meiser     delete (*mat)->column_indices;
38507f756511SDominic Meiser     delete (*mat)->row_offsets;
38517f756511SDominic Meiser     delete *mat;
38527f756511SDominic Meiser     *mat = 0;
38537f756511SDominic Meiser   }
38547f756511SDominic Meiser   PetscFunctionReturn(0);
38557f756511SDominic Meiser }
38567f756511SDominic Meiser 
3857d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
3858d71ae5a4SJacob Faibussowitsch {
38597f756511SDominic Meiser   PetscFunctionBegin;
38607f756511SDominic Meiser   if (*trifactor) {
38619566063dSJacob Faibussowitsch     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
3862261a78b4SJunchao Zhang     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
38639566063dSJacob Faibussowitsch     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
38649566063dSJacob Faibussowitsch     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
38659566063dSJacob Faibussowitsch     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
3866afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
38679566063dSJacob Faibussowitsch     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
3868afb2bd1cSJunchao Zhang #endif
38699566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactor));
38707f756511SDominic Meiser   }
38717f756511SDominic Meiser   PetscFunctionReturn(0);
38727f756511SDominic Meiser }
38737f756511SDominic Meiser 
3874d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
3875d71ae5a4SJacob Faibussowitsch {
38767f756511SDominic Meiser   CsrMatrix *mat;
38777f756511SDominic Meiser 
38787f756511SDominic Meiser   PetscFunctionBegin;
38797f756511SDominic Meiser   if (*matstruct) {
38807f756511SDominic Meiser     if ((*matstruct)->mat) {
38817f756511SDominic Meiser       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3882afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3883afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3884afb2bd1cSJunchao Zhang #else
38857f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
38869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
3887afb2bd1cSJunchao Zhang #endif
38887f756511SDominic Meiser       } else {
38897f756511SDominic Meiser         mat = (CsrMatrix *)(*matstruct)->mat;
38907f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
38917f756511SDominic Meiser       }
38927f756511SDominic Meiser     }
38939566063dSJacob Faibussowitsch     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
38947f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
38959566063dSJacob Faibussowitsch     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
38969566063dSJacob Faibussowitsch     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
38979566063dSJacob Faibussowitsch     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
3898afb2bd1cSJunchao Zhang 
3899afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3900afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
39019566063dSJacob Faibussowitsch     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
3902afb2bd1cSJunchao Zhang     for (int i = 0; i < 3; i++) {
3903afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
39049566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
39059566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
39069566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3907afb2bd1cSJunchao Zhang       }
3908afb2bd1cSJunchao Zhang     }
3909afb2bd1cSJunchao Zhang #endif
39107f756511SDominic Meiser     delete *matstruct;
39117e8381f9SStefano Zampini     *matstruct = NULL;
39127f756511SDominic Meiser   }
39137f756511SDominic Meiser   PetscFunctionReturn(0);
39147f756511SDominic Meiser }
39157f756511SDominic Meiser 
3916d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
3917d71ae5a4SJacob Faibussowitsch {
3918da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
3919da112707SJunchao Zhang 
39207f756511SDominic Meiser   PetscFunctionBegin;
3921da112707SJunchao Zhang   if (fs) {
3922da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3923da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3924da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3925da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3926da112707SJunchao Zhang     delete fs->rpermIndices;
3927da112707SJunchao Zhang     delete fs->cpermIndices;
3928da112707SJunchao Zhang     delete fs->workVector;
3929da112707SJunchao Zhang     fs->rpermIndices = NULL;
3930da112707SJunchao Zhang     fs->cpermIndices = NULL;
3931da112707SJunchao Zhang     fs->workVector   = NULL;
3932da112707SJunchao Zhang     if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
3933da112707SJunchao Zhang     if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
3934da112707SJunchao Zhang     fs->init_dev_prop = PETSC_FALSE;
3935da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
3936da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr));
3937da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx));
3938da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrVal));
3939da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->X));
3940da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->Y));
394112ba2bc6SJunchao Zhang     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3942da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3943da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
394412ba2bc6SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3945da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3946da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3947da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3948da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3949da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3950da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3951da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3952da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3953da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3954da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3955da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3956da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
395712ba2bc6SJunchao Zhang 
395812ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
395912ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3960da112707SJunchao Zhang #endif
3961ccdfe979SStefano Zampini   }
3962ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3963ccdfe979SStefano Zampini }
3964ccdfe979SStefano Zampini 
3965d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
3966d71ae5a4SJacob Faibussowitsch {
3967ccdfe979SStefano Zampini   cusparseHandle_t handle;
3968ccdfe979SStefano Zampini 
3969ccdfe979SStefano Zampini   PetscFunctionBegin;
3970ccdfe979SStefano Zampini   if (*trifactors) {
39719566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
397248a46eb9SPierre Jolivet     if (handle = (*trifactors)->handle) PetscCallCUSPARSE(cusparseDestroy(handle));
39739566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactors));
39747f756511SDominic Meiser   }
39757f756511SDominic Meiser   PetscFunctionReturn(0);
39767f756511SDominic Meiser }
39777e8381f9SStefano Zampini 
39789371c9d4SSatish Balay struct IJCompare {
3979d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3980d71ae5a4SJacob Faibussowitsch   {
39817e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
39827e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
39837e8381f9SStefano Zampini     return false;
39847e8381f9SStefano Zampini   }
39857e8381f9SStefano Zampini };
39867e8381f9SStefano Zampini 
39879371c9d4SSatish Balay struct IJEqual {
3988d71ae5a4SJacob Faibussowitsch   __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
3989d71ae5a4SJacob Faibussowitsch   {
39907e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
39917e8381f9SStefano Zampini     return true;
39927e8381f9SStefano Zampini   }
39937e8381f9SStefano Zampini };
39947e8381f9SStefano Zampini 
39959371c9d4SSatish Balay struct IJDiff {
39969371c9d4SSatish Balay   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 == t2 ? 0 : 1; }
39977e8381f9SStefano Zampini };
39987e8381f9SStefano Zampini 
39999371c9d4SSatish Balay struct IJSum {
40009371c9d4SSatish Balay   __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 || t2; }
40017e8381f9SStefano Zampini };
40027e8381f9SStefano Zampini 
40037e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
4004219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
4005d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
4006d71ae5a4SJacob Faibussowitsch {
40077e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
4008fcdce8c4SStefano Zampini   Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
4009bfcc3627SStefano Zampini   THRUSTARRAY                          *cooPerm_v = NULL;
401008391a17SStefano Zampini   thrust::device_ptr<const PetscScalar> d_v;
40117e8381f9SStefano Zampini   CsrMatrix                            *matrix;
40127e8381f9SStefano Zampini   PetscInt                              n;
40137e8381f9SStefano Zampini 
40147e8381f9SStefano Zampini   PetscFunctionBegin;
401528b400f6SJacob Faibussowitsch   PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
401628b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
40177e8381f9SStefano Zampini   if (!cusp->cooPerm) {
40189566063dSJacob Faibussowitsch     PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
40199566063dSJacob Faibussowitsch     PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
40207e8381f9SStefano Zampini     PetscFunctionReturn(0);
40217e8381f9SStefano Zampini   }
40227e8381f9SStefano Zampini   matrix = (CsrMatrix *)cusp->mat->mat;
402328b400f6SJacob Faibussowitsch   PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4024e61fc153SStefano Zampini   if (!v) {
4025e61fc153SStefano Zampini     if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
4026e61fc153SStefano Zampini     goto finalize;
40277e8381f9SStefano Zampini   }
4028e61fc153SStefano Zampini   n = cusp->cooPerm->size();
402908391a17SStefano Zampini   if (isCudaMem(v)) {
403008391a17SStefano Zampini     d_v = thrust::device_pointer_cast(v);
403108391a17SStefano Zampini   } else {
4032e61fc153SStefano Zampini     cooPerm_v = new THRUSTARRAY(n);
4033e61fc153SStefano Zampini     cooPerm_v->assign(v, v + n);
403408391a17SStefano Zampini     d_v = cooPerm_v->data();
40359566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
403608391a17SStefano Zampini   }
40379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
4038e61fc153SStefano Zampini   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
4039ddea5d60SJunchao Zhang     if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add these them */
4040bfcc3627SStefano Zampini       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
404108391a17SStefano Zampini       auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4042ddea5d60SJunchao Zhang       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
4043ddea5d60SJunchao Zhang         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
4044ddea5d60SJunchao Zhang         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
4045ddea5d60SJunchao Zhang       */
4046e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
4047e61fc153SStefano Zampini       thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
4048e61fc153SStefano Zampini       delete cooPerm_w;
40497e8381f9SStefano Zampini     } else {
4050ddea5d60SJunchao Zhang       /* all nonzeros in d_v[] are unique entries */
40519371c9d4SSatish Balay       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
40529371c9d4SSatish Balay       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
4053ddea5d60SJunchao Zhang       thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
40547e8381f9SStefano Zampini     }
40557e8381f9SStefano Zampini   } else {
4056e61fc153SStefano Zampini     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
405708391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
4058e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
40597e8381f9SStefano Zampini     } else {
40609371c9d4SSatish Balay       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
40619371c9d4SSatish Balay       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
40627e8381f9SStefano Zampini       thrust::for_each(zibit, zieit, VecCUDAEquals());
40637e8381f9SStefano Zampini     }
40647e8381f9SStefano Zampini   }
40659566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
4066e61fc153SStefano Zampini finalize:
4067e61fc153SStefano Zampini   delete cooPerm_v;
40687e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
40699566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4070fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
40719566063dSJacob Faibussowitsch   PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
40729566063dSJacob Faibussowitsch   PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
40739566063dSJacob Faibussowitsch   PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
4074fcdce8c4SStefano Zampini   a->reallocs = 0;
4075fcdce8c4SStefano Zampini   A->info.mallocs += 0;
4076fcdce8c4SStefano Zampini   A->info.nz_unneeded = 0;
4077fcdce8c4SStefano Zampini   A->assembled = A->was_assembled = PETSC_TRUE;
4078fcdce8c4SStefano Zampini   A->num_ass++;
40797e8381f9SStefano Zampini   PetscFunctionReturn(0);
40807e8381f9SStefano Zampini }
40817e8381f9SStefano Zampini 
4082d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
4083d71ae5a4SJacob Faibussowitsch {
4084a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4085a49f1ed0SStefano Zampini 
4086a49f1ed0SStefano Zampini   PetscFunctionBegin;
4087a49f1ed0SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4088a49f1ed0SStefano Zampini   if (!cusp) PetscFunctionReturn(0);
4089a49f1ed0SStefano Zampini   if (destroy) {
40909566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
4091a49f1ed0SStefano Zampini     delete cusp->csr2csc_i;
4092a49f1ed0SStefano Zampini     cusp->csr2csc_i = NULL;
4093a49f1ed0SStefano Zampini   }
40941a2c6b5cSJunchao Zhang   A->transupdated = PETSC_FALSE;
4095a49f1ed0SStefano Zampini   PetscFunctionReturn(0);
4096a49f1ed0SStefano Zampini }
4097a49f1ed0SStefano Zampini 
40987e8381f9SStefano Zampini #include <thrust/binary_search.h>
4099219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
4100d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
4101d71ae5a4SJacob Faibussowitsch {
41027e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
41037e8381f9SStefano Zampini   Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
41047e8381f9SStefano Zampini   PetscInt            cooPerm_n, nzr = 0;
41057e8381f9SStefano Zampini 
41067e8381f9SStefano Zampini   PetscFunctionBegin;
41079566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(A->rmap));
41089566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(A->cmap));
41097e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
41107e8381f9SStefano Zampini   if (n != cooPerm_n) {
41117e8381f9SStefano Zampini     delete cusp->cooPerm;
41127e8381f9SStefano Zampini     delete cusp->cooPerm_a;
41137e8381f9SStefano Zampini     cusp->cooPerm   = NULL;
41147e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
41157e8381f9SStefano Zampini   }
41167e8381f9SStefano Zampini   if (n) {
4117e8729f6fSJunchao Zhang     thrust::device_ptr<PetscInt> d_i, d_j;
4118e8729f6fSJunchao Zhang     PetscInt                    *d_raw_i, *d_raw_j;
4119e8729f6fSJunchao Zhang     PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
4120e8729f6fSJunchao Zhang     PetscMemType                 imtype, jmtype;
4121e8729f6fSJunchao Zhang 
4122e8729f6fSJunchao Zhang     PetscCall(PetscGetMemType(coo_i, &imtype));
4123e8729f6fSJunchao Zhang     if (PetscMemTypeHost(imtype)) {
4124e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
4125e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4126e8729f6fSJunchao Zhang       d_i        = thrust::device_pointer_cast(d_raw_i);
4127e8729f6fSJunchao Zhang       free_raw_i = PETSC_TRUE;
4128e8729f6fSJunchao Zhang       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4129e8729f6fSJunchao Zhang     } else {
4130e8729f6fSJunchao Zhang       d_i = thrust::device_pointer_cast(coo_i);
4131e8729f6fSJunchao Zhang     }
4132e8729f6fSJunchao Zhang 
4133e8729f6fSJunchao Zhang     PetscCall(PetscGetMemType(coo_j, &jmtype));
4134e8729f6fSJunchao Zhang     if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
4135e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
4136e8729f6fSJunchao Zhang       PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
4137e8729f6fSJunchao Zhang       d_j        = thrust::device_pointer_cast(d_raw_j);
4138e8729f6fSJunchao Zhang       free_raw_j = PETSC_TRUE;
4139e8729f6fSJunchao Zhang       PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
4140e8729f6fSJunchao Zhang     } else {
4141e8729f6fSJunchao Zhang       d_j = thrust::device_pointer_cast(coo_j);
4142e8729f6fSJunchao Zhang     }
4143e8729f6fSJunchao Zhang 
41447e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
41457e8381f9SStefano Zampini 
4146ad540459SPierre Jolivet     if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
4147ad540459SPierre Jolivet     if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);
41487e8381f9SStefano Zampini 
4149ddea5d60SJunchao Zhang     /* Ex.
4150ddea5d60SJunchao Zhang       n = 6
4151ddea5d60SJunchao Zhang       coo_i = [3,3,1,4,1,4]
4152ddea5d60SJunchao Zhang       coo_j = [3,2,2,5,2,6]
4153ddea5d60SJunchao Zhang     */
4154e8729f6fSJunchao Zhang     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
4155e8729f6fSJunchao Zhang     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));
41567e8381f9SStefano Zampini 
41579566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
41587e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4159ddea5d60SJunchao Zhang     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4160e8729f6fSJunchao Zhang     (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
4161e8729f6fSJunchao Zhang     THRUSTINTARRAY w(d_j, d_j + n);
41627e8381f9SStefano Zampini 
4163ddea5d60SJunchao Zhang     /*
4164ddea5d60SJunchao Zhang       d_i     = [1,1,3,3,4,4]
4165ddea5d60SJunchao Zhang       d_j     = [2,2,2,3,5,6]
4166ddea5d60SJunchao Zhang       cooPerm = [2,4,1,0,3,5]
4167ddea5d60SJunchao Zhang     */
4168ddea5d60SJunchao Zhang     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4169ddea5d60SJunchao Zhang 
4170ddea5d60SJunchao Zhang     /*
4171ddea5d60SJunchao Zhang       d_i     = [1,3,3,4,4,x]
4172ddea5d60SJunchao Zhang                             ^ekey
4173ddea5d60SJunchao Zhang       d_j     = [2,2,3,5,6,x]
4174ddea5d60SJunchao Zhang                            ^nekye
4175ddea5d60SJunchao Zhang     */
41767e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
41777e8381f9SStefano Zampini       delete cusp->cooPerm_a;
41787e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
4179ddea5d60SJunchao Zhang     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4180ddea5d60SJunchao Zhang       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4181ddea5d60SJunchao Zhang       adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4182ddea5d60SJunchao Zhang       adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4183ddea5d60SJunchao Zhang       (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
41847e8381f9SStefano Zampini       w[0]                  = 0;
4185ddea5d60SJunchao Zhang       thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
4186ddea5d60SJunchao Zhang       thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
41877e8381f9SStefano Zampini     }
41887e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
4189e8729f6fSJunchao Zhang     thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4190ddea5d60SJunchao Zhang                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4191ddea5d60SJunchao Zhang                         ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
41929566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
41937e8381f9SStefano Zampini 
41949566063dSJacob Faibussowitsch     PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
41957e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
41967e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
41977e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
41989566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
4199ddea5d60SJunchao Zhang     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
42009566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
42017e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
4202fcdce8c4SStefano Zampini     a->rmax          = 0;
42039566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(a->nz, &a->a));
42049566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(a->nz, &a->j));
4205e8729f6fSJunchao Zhang     PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
42069566063dSJacob Faibussowitsch     if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
42079566063dSJacob Faibussowitsch     if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
42087e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
42097e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i + 1] - a->i[i];
42107e8381f9SStefano Zampini       nzr += (PetscInt) !!(nnzr);
42117e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
4212fcdce8c4SStefano Zampini       a->rmax                 = PetscMax(a->rmax, nnzr);
42137e8381f9SStefano Zampini     }
4214fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
42157e8381f9SStefano Zampini     A->preallocated  = PETSC_TRUE;
42169566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
42179566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(A));
4218e8729f6fSJunchao Zhang     if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
4219e8729f6fSJunchao Zhang     if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
42207e8381f9SStefano Zampini   } else {
42219566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
42227e8381f9SStefano Zampini   }
42239566063dSJacob Faibussowitsch   PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));
42247e8381f9SStefano Zampini 
42257e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
4226e61fc153SStefano Zampini      The code is so convoluted now that I prefer to copy zeros */
42279566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->nz));
42289566063dSJacob Faibussowitsch   PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
42297e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
42309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
42319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
42327e8381f9SStefano Zampini   PetscFunctionReturn(0);
42337e8381f9SStefano Zampini }
4234ed502f03SStefano Zampini 
4235d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
4236d71ae5a4SJacob Faibussowitsch {
4237219fbbafSJunchao Zhang   Mat_SeqAIJ         *seq;
4238219fbbafSJunchao Zhang   Mat_SeqAIJCUSPARSE *dev;
4239cbc6b225SStefano Zampini   PetscBool           coo_basic = PETSC_TRUE;
4240219fbbafSJunchao Zhang   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
4241219fbbafSJunchao Zhang 
4242219fbbafSJunchao Zhang   PetscFunctionBegin;
42439566063dSJacob Faibussowitsch   PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
42449566063dSJacob Faibussowitsch   PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
4245219fbbafSJunchao Zhang   if (coo_i) {
42469566063dSJacob Faibussowitsch     PetscCall(PetscGetMemType(coo_i, &mtype));
4247219fbbafSJunchao Zhang     if (PetscMemTypeHost(mtype)) {
4248219fbbafSJunchao Zhang       for (PetscCount k = 0; k < coo_n; k++) {
42499371c9d4SSatish Balay         if (coo_i[k] < 0 || coo_j[k] < 0) {
42509371c9d4SSatish Balay           coo_basic = PETSC_FALSE;
42519371c9d4SSatish Balay           break;
42529371c9d4SSatish Balay         }
4253219fbbafSJunchao Zhang       }
4254219fbbafSJunchao Zhang     }
4255219fbbafSJunchao Zhang   }
4256219fbbafSJunchao Zhang 
4257219fbbafSJunchao Zhang   if (coo_basic) { /* i,j are on device or do not contain negative indices */
42589566063dSJacob Faibussowitsch     PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
4259219fbbafSJunchao Zhang   } else {
42609566063dSJacob Faibussowitsch     PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
4261cbc6b225SStefano Zampini     mat->offloadmask = PETSC_OFFLOAD_CPU;
42629566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
4263219fbbafSJunchao Zhang     seq = static_cast<Mat_SeqAIJ *>(mat->data);
4264219fbbafSJunchao Zhang     dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
42659566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
42669566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
42679566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
42689566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
4269219fbbafSJunchao Zhang     dev->use_extended_coo = PETSC_TRUE;
4270219fbbafSJunchao Zhang   }
4271219fbbafSJunchao Zhang   PetscFunctionReturn(0);
4272219fbbafSJunchao Zhang }
4273219fbbafSJunchao Zhang 
4274d71ae5a4SJacob Faibussowitsch __global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
4275d71ae5a4SJacob Faibussowitsch {
4276219fbbafSJunchao Zhang   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
4277219fbbafSJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
4278b6c38306SJunchao Zhang   for (; i < nnz; i += grid_size) {
4279b6c38306SJunchao Zhang     PetscScalar sum = 0.0;
4280b6c38306SJunchao Zhang     for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
4281b6c38306SJunchao Zhang     a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
4282b6c38306SJunchao Zhang   }
4283219fbbafSJunchao Zhang }
4284219fbbafSJunchao Zhang 
4285d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
4286d71ae5a4SJacob Faibussowitsch {
4287219fbbafSJunchao Zhang   Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
4288219fbbafSJunchao Zhang   Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
4289219fbbafSJunchao Zhang   PetscCount          Annz = seq->nz;
4290219fbbafSJunchao Zhang   PetscMemType        memtype;
4291219fbbafSJunchao Zhang   const PetscScalar  *v1 = v;
4292219fbbafSJunchao Zhang   PetscScalar        *Aa;
4293219fbbafSJunchao Zhang 
4294219fbbafSJunchao Zhang   PetscFunctionBegin;
4295219fbbafSJunchao Zhang   if (dev->use_extended_coo) {
42969566063dSJacob Faibussowitsch     PetscCall(PetscGetMemType(v, &memtype));
4297219fbbafSJunchao Zhang     if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
42989566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
42999566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
4300219fbbafSJunchao Zhang     }
4301219fbbafSJunchao Zhang 
43029566063dSJacob Faibussowitsch     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
43039566063dSJacob Faibussowitsch     else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));
4304219fbbafSJunchao Zhang 
4305cbc6b225SStefano Zampini     if (Annz) {
4306b6c38306SJunchao Zhang       MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
43079566063dSJacob Faibussowitsch       PetscCallCUDA(cudaPeekAtLastError());
4308cbc6b225SStefano Zampini     }
4309219fbbafSJunchao Zhang 
43109566063dSJacob Faibussowitsch     if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
43119566063dSJacob Faibussowitsch     else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
4312219fbbafSJunchao Zhang 
43139566063dSJacob Faibussowitsch     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
4314219fbbafSJunchao Zhang   } else {
43159566063dSJacob Faibussowitsch     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
4316219fbbafSJunchao Zhang   }
4317219fbbafSJunchao Zhang   PetscFunctionReturn(0);
4318219fbbafSJunchao Zhang }
4319219fbbafSJunchao Zhang 
43205b7e41feSStefano Zampini /*@C
432111a5261eSBarry Smith     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
43225b7e41feSStefano Zampini 
43235b7e41feSStefano Zampini    Not collective
43245b7e41feSStefano Zampini 
43255b7e41feSStefano Zampini     Input Parameters:
43265b7e41feSStefano Zampini +   A - the matrix
432711a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
43285b7e41feSStefano Zampini 
43295b7e41feSStefano Zampini     Output Parameters:
43305b7e41feSStefano Zampini +   ia - the CSR row pointers
43315b7e41feSStefano Zampini -   ja - the CSR column indices
43325b7e41feSStefano Zampini 
43335b7e41feSStefano Zampini     Level: developer
43345b7e41feSStefano Zampini 
433511a5261eSBarry Smith     Note:
43365b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
43375b7e41feSStefano Zampini 
4338db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
43395b7e41feSStefano Zampini @*/
4340d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
4341d71ae5a4SJacob Faibussowitsch {
43425f101d05SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
43435f101d05SStefano Zampini   CsrMatrix          *csr;
43445f101d05SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;
43455f101d05SStefano Zampini 
43465f101d05SStefano Zampini   PetscFunctionBegin;
43475f101d05SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
43485f101d05SStefano Zampini   if (!i || !j) PetscFunctionReturn(0);
43495f101d05SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4350aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
43519566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
435228b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
43535f101d05SStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
43545f101d05SStefano Zampini   if (i) {
43555f101d05SStefano Zampini     if (!compressed && a->compressedrow.use) { /* need full row offset */
43565f101d05SStefano Zampini       if (!cusp->rowoffsets_gpu) {
43575f101d05SStefano Zampini         cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
43585f101d05SStefano Zampini         cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
43599566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
43605f101d05SStefano Zampini       }
43615f101d05SStefano Zampini       *i = cusp->rowoffsets_gpu->data().get();
43625f101d05SStefano Zampini     } else *i = csr->row_offsets->data().get();
43635f101d05SStefano Zampini   }
43645f101d05SStefano Zampini   if (j) *j = csr->column_indices->data().get();
43655f101d05SStefano Zampini   PetscFunctionReturn(0);
43665f101d05SStefano Zampini }
43675f101d05SStefano Zampini 
43685b7e41feSStefano Zampini /*@C
436911a5261eSBarry Smith     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
43705b7e41feSStefano Zampini 
43715b7e41feSStefano Zampini    Not collective
43725b7e41feSStefano Zampini 
43735b7e41feSStefano Zampini     Input Parameters:
43745b7e41feSStefano Zampini +   A - the matrix
437511a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
43765b7e41feSStefano Zampini 
43775b7e41feSStefano Zampini     Output Parameters:
43785b7e41feSStefano Zampini +   ia - the CSR row pointers
43795b7e41feSStefano Zampini -   ja - the CSR column indices
43805b7e41feSStefano Zampini 
43815b7e41feSStefano Zampini     Level: developer
43825b7e41feSStefano Zampini 
4383db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()`
43845b7e41feSStefano Zampini @*/
4385*8eb1d50fSPierre Jolivet PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool, const int **i, const int **j)
4386d71ae5a4SJacob Faibussowitsch {
43875f101d05SStefano Zampini   PetscFunctionBegin;
43885f101d05SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
43895f101d05SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
43905f101d05SStefano Zampini   if (i) *i = NULL;
43915f101d05SStefano Zampini   if (j) *j = NULL;
43925f101d05SStefano Zampini   PetscFunctionReturn(0);
43935f101d05SStefano Zampini }
43945f101d05SStefano Zampini 
43955b7e41feSStefano Zampini /*@C
439611a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
43975b7e41feSStefano Zampini 
43985b7e41feSStefano Zampini    Not Collective
43995b7e41feSStefano Zampini 
44005b7e41feSStefano Zampini    Input Parameter:
440111a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44025b7e41feSStefano Zampini 
44035b7e41feSStefano Zampini    Output Parameter:
44045b7e41feSStefano Zampini .   a - pointer to the device data
44055b7e41feSStefano Zampini 
44065b7e41feSStefano Zampini    Level: developer
44075b7e41feSStefano Zampini 
440811a5261eSBarry Smith    Note:
440911a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
44105b7e41feSStefano Zampini 
4411db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
44125b7e41feSStefano Zampini @*/
4413d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
4414d71ae5a4SJacob Faibussowitsch {
4415ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4416ed502f03SStefano Zampini   CsrMatrix          *csr;
4417ed502f03SStefano Zampini 
4418ed502f03SStefano Zampini   PetscFunctionBegin;
4419ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4420ed502f03SStefano Zampini   PetscValidPointer(a, 2);
4421ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4422aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
44239566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
442428b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4425ed502f03SStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
442628b400f6SJacob Faibussowitsch   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4427ed502f03SStefano Zampini   *a = csr->values->data().get();
4428ed502f03SStefano Zampini   PetscFunctionReturn(0);
4429ed502f03SStefano Zampini }
4430ed502f03SStefano Zampini 
44315b7e41feSStefano Zampini /*@C
443211a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
44335b7e41feSStefano Zampini 
44345b7e41feSStefano Zampini    Not Collective
44355b7e41feSStefano Zampini 
   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data obtained with `MatSeqAIJCUSPARSEGetArrayRead()`
44415b7e41feSStefano Zampini 
44425b7e41feSStefano Zampini    Level: developer
44435b7e41feSStefano Zampini 
4444db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
44455b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: nothing was modified, so no invalidation or state bump is needed;
     just NULL the caller's pointer so the device array cannot be used after restore */
  *a = NULL;
  PetscFunctionReturn(0);
}
4455ed502f03SStefano Zampini 
44565b7e41feSStefano Zampini /*@C
445711a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
44585b7e41feSStefano Zampini 
44595b7e41feSStefano Zampini    Not Collective
44605b7e41feSStefano Zampini 
44615b7e41feSStefano Zampini    Input Parameter:
446211a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44635b7e41feSStefano Zampini 
44645b7e41feSStefano Zampini    Output Parameter:
44655b7e41feSStefano Zampini .   a - pointer to the device data
44665b7e41feSStefano Zampini 
44675b7e41feSStefano Zampini    Level: developer
44685b7e41feSStefano Zampini 
446911a5261eSBarry Smith    Note:
447011a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
44715b7e41feSStefano Zampini 
4472db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
44735b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a contiguous device value array */
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* read-write access: the device copy must be current before handing it out */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* caller may write: the device copy becomes the authoritative one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values may no longer match */
  PetscFunctionReturn(0);
}
44935b7e41feSStefano Zampini /*@C
449411a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4495039c6fbaSStefano Zampini 
44965b7e41feSStefano Zampini    Not Collective
44975b7e41feSStefano Zampini 
   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data obtained with `MatSeqAIJCUSPARSEGetArray()`
45035b7e41feSStefano Zampini 
45045b7e41feSStefano Zampini    Level: developer
45055b7e41feSStefano Zampini 
4506db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`
45075b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));           /* values may have changed: cached diagonal data is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* advertise the (potential) modification to dependents */
  *a = NULL; /* prevent further use of the returned device pointer */
  PetscFunctionReturn(0);
}
4519039c6fbaSStefano Zampini 
45205b7e41feSStefano Zampini /*@C
452111a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45225b7e41feSStefano Zampini 
45235b7e41feSStefano Zampini    Not Collective
45245b7e41feSStefano Zampini 
45255b7e41feSStefano Zampini    Input Parameter:
452611a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45275b7e41feSStefano Zampini 
45285b7e41feSStefano Zampini    Output Parameter:
45295b7e41feSStefano Zampini .   a - pointer to the device data
45305b7e41feSStefano Zampini 
45315b7e41feSStefano Zampini    Level: developer
45325b7e41feSStefano Zampini 
453311a5261eSBarry Smith    Note:
453411a5261eSBarry Smith    Does not trigger host-device copies and flags data validity on the GPU
45355b7e41feSStefano Zampini 
4536db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
45375b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a contiguous device value array */
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: unlike the Get/GetRead variants there is no MatSeqAIJCUSPARSECopyToGPU()
     here, since the current contents will be overwritten by the caller */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a             = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy is about to be written and becomes authoritative */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); /* cached transpose values may no longer match */
  PetscFunctionReturn(0);
}
4556ed502f03SStefano Zampini 
45575b7e41feSStefano Zampini /*@C
455811a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
45595b7e41feSStefano Zampini 
45605b7e41feSStefano Zampini    Not Collective
45615b7e41feSStefano Zampini 
   Input Parameters:
+   A - a `MATSEQAIJCUSPARSE` matrix
-   a - pointer to the device data obtained with `MatSeqAIJCUSPARSEGetArrayWrite()`
45675b7e41feSStefano Zampini 
45685b7e41feSStefano Zampini    Level: developer
45695b7e41feSStefano Zampini 
4570db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
45715b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCall(MatSeqAIJInvalidateDiagonal(A));           /* values were (presumably) rewritten: cached diagonal data is stale */
  PetscCall(PetscObjectStateIncrease((PetscObject)A)); /* advertise the modification to dependents */
  *a = NULL; /* prevent further use of the returned device pointer */
  PetscFunctionReturn(0);
}
4583ed502f03SStefano Zampini 
/* Lexicographic (row, column) ordering for zipped COO entries
   (row, col, value, source-flag), used to merge two sorted matrices */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;     /* primary key: row index */
    return t1.get<1>() < t2.get<1>(); /* tie-break on column index */
  }
};
4592ed502f03SStefano Zampini 
/* Functor that adds a fixed offset to an index; used to relocate the
   column/row indices of matrix B after those of A when concatenating */
struct Shift {
  int _shift; /* offset applied to every input index */

  Shift(int shift) : _shift(shift) { }
  /* const-qualified: the call has no side effects, so the functor can also be
     invoked through const copies held inside thrust transform iterators */
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};
4599ed502f03SStefano Zampini 
4600ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4601d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4602d71ae5a4SJacob Faibussowitsch {
4603ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4604ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4605ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4606ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4607ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4608ed502f03SStefano Zampini   cusparseStatus_t              stat;
4609ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4610ed502f03SStefano Zampini 
4611ed502f03SStefano Zampini   PetscFunctionBegin;
4612ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4613ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4614ed502f03SStefano Zampini   PetscValidPointer(C, 4);
4615ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4616ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
46175f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
461808401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4619aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4620aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4621ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4622ed502f03SStefano Zampini     m = A->rmap->n;
4623ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
46249566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
46259566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
46269566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4627ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4628ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4629ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4630ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4631ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4632ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4633ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4634ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4635ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4636ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4637ed502f03SStefano Zampini     Ccusp->nrows            = m;
4638ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4639ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4640ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4641ed502f03SStefano Zampini     Ccsr->num_cols          = n;
46429566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
46439566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
46449566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
46459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
46469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
46479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
46489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46499566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46519566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46529566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
465328b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
465428b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4655ed502f03SStefano Zampini 
4656ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4657ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4658ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4659ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4660ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4661ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4662ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4663ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4664ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
4665ed502f03SStefano Zampini     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4666ed502f03SStefano Zampini     if (c->nz) {
46672ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
46682ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
46692ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
46702ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
46712ed87e7eSStefano Zampini 
4672ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4673ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4674ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4675ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
46769566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4677ed502f03SStefano Zampini         }
46782ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
46792ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4680ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4681ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4682ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4683ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
46849566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4685ed502f03SStefano Zampini         }
46862ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
46872ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
46889566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
46899371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46909371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46919371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46929371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46932ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
46942ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
46952ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
46968909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4697ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4698ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
46998909a122SStefano Zampini #else
47008909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
47018909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
47028909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
47038909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
47048909a122SStefano Zampini #endif
47052ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
47062ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
47072ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
47082ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
47092ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
47102ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4711ed502f03SStefano Zampini       auto p1    = Ccusp->cooPerm->begin();
4712ed502f03SStefano Zampini       auto p2    = Ccusp->cooPerm->begin();
4713ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4714792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
47158909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
47168909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
47178909a122SStefano Zampini #endif
47182ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
47192ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
47202ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4721792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
47222ed87e7eSStefano Zampini #else
47232ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4724792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4725792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
47262ed87e7eSStefano Zampini #endif
47279371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47289371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47299566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
47302ed87e7eSStefano Zampini       delete wPerm;
47312ed87e7eSStefano Zampini       delete Acoo;
47322ed87e7eSStefano Zampini       delete Bcoo;
47332ed87e7eSStefano Zampini       delete Ccoo;
4734ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47359371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47369371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4737ed502f03SStefano Zampini #endif
47381a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
47399566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
47409566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4741ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4742ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4743ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4744ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4745ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4746ed502f03SStefano Zampini 
47471a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
47481a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4749a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4750ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4751ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4752ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4753ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4754ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4755ed502f03SStefano Zampini 
4756ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4757ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4758ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4759ed502f03SStefano Zampini 
47609566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4761ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4762ed502f03SStefano Zampini         if (AT) {
4763ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4764ed502f03SStefano Zampini           thrust::advance(rT, -1);
4765ed502f03SStefano Zampini         }
4766ed502f03SStefano Zampini         if (BT) {
4767ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4768ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4769ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4770ed502f03SStefano Zampini         }
4771ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4772ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4773ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4774ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4775ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4776ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
47779566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4778ed502f03SStefano Zampini 
47799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
47809566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
47819566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
47829566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
47839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
47849566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
47859566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47869566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47879566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4788ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47899371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47909371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4791ed502f03SStefano Zampini #endif
4792ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4793ed502f03SStefano Zampini       }
4794ed502f03SStefano Zampini     }
4795ed502f03SStefano Zampini 
4796ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4797ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4798ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
47999566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
48009566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
4801ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4802ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4803ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4804ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4805ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
48069566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
48079566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4808ed502f03SStefano Zampini     } else {
48099566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
48109566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4811ed502f03SStefano Zampini     }
48129566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
48139566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
48149566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4815ed502f03SStefano Zampini     c->maxnz         = c->nz;
4816ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4817ed502f03SStefano Zampini     c->rmax          = 0;
4818ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4819ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4820ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4821ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4822ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4823ed502f03SStefano Zampini     }
48249566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
48259566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4826ed502f03SStefano Zampini     (*C)->nonzerostate++;
48279566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
48289566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4829ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4830ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4831ed502f03SStefano Zampini   } else {
483208401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4833ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4834ed502f03SStefano Zampini     if (c->nz) {
4835ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
48365f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4837aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
483808401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
48399566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
48409566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
48415f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
48425f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4843ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4844ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4845ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4846aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4847aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4848aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4849aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
48505f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4851ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4852ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
48539566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48549371c9d4SSatish Balay       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
48559371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4856ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
48579371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
48589371c9d4SSatish Balay       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4859ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
48609566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
48611a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
48625f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4863ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4864ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4865ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4866ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4867ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4868ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4869ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48701a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4871ed502f03SStefano Zampini       }
48729566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4873ed502f03SStefano Zampini     }
4874ed502f03SStefano Zampini   }
48759566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4876ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4877ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4878ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4879ed502f03SStefano Zampini   PetscFunctionReturn(0);
4880ed502f03SStefano Zampini }
4881c215019aSStefano Zampini 
4882d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4883d71ae5a4SJacob Faibussowitsch {
4884c215019aSStefano Zampini   bool               dmem;
4885c215019aSStefano Zampini   const PetscScalar *av;
4886c215019aSStefano Zampini 
4887c215019aSStefano Zampini   PetscFunctionBegin;
4888c215019aSStefano Zampini   dmem = isCudaMem(v);
48899566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4890c215019aSStefano Zampini   if (n && idx) {
4891c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4892c215019aSStefano Zampini     widx.assign(idx, idx + n);
48939566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4894c215019aSStefano Zampini 
4895c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
4896c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
4897c215019aSStefano Zampini     if (dmem) {
4898c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
4899c215019aSStefano Zampini     } else {
4900c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
4901c215019aSStefano Zampini       dv = w->data();
4902c215019aSStefano Zampini     }
4903c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4904c215019aSStefano Zampini 
4905c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4906c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4907c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
490848a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4909c215019aSStefano Zampini     delete w;
4910c215019aSStefano Zampini   } else {
49119566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4912c215019aSStefano Zampini   }
49139566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
49149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4915c215019aSStefano Zampini   PetscFunctionReturn(0);
4916c215019aSStefano Zampini }
4917