xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Printable names for MatCUSPARSEStorageFormat; the last three entries (enum type name,
   option prefix, NULL terminator) are required by PETSc's PetscOptionsEnum() machinery */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
24afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26afb2bd1cSJunchao Zhang 
27afb2bd1cSJunchao Zhang   typedef enum {
28afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
29afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
30afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
31afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
32afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
33afb2bd1cSJunchao Zhang 
34afb2bd1cSJunchao Zhang   typedef enum {
35afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
47afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
48afb2bd1cSJunchao Zhang 
49afb2bd1cSJunchao Zhang   typedef enum {
50afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
53afb2bd1cSJunchao Zhang   */
/* Names listed in 0-based cuSPARSE enum-value order (see the comment above): PetscOptionsEnum()
   maps the parsed position in the array directly to the cuSPARSE enum value */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
/* cusparseCsr2CscAlg_t has no enum with value 0, so slot 0 is a placeholder we invented */
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
57afb2bd1cSJunchao Zhang #endif
589ae82921SPaul Mullowney 
59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
71dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
877f756511SDominic Meiser 
8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9057181aedSStefano Zampini 
91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
92e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
94c215019aSStefano Zampini 
/* Type-specific implementation of MatCUSPARSESetFormat() for SEQAIJCUSPARSE matrices:
   records the requested GPU storage format in the matrix's cuSPARSE context.
   Only MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are accepted; both set the same
   (single) format field of the sequential matrix. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
1069ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation if one is composed on A; a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
130e057df02SPaul Mullowney 
/* Type-specific implementation of MatCUSPARSESetUseCPUSolve() for SEQAIJCUSPARSE:
   stores the flag that selects the CPU (rather than GPU) triangular solve path. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  cusp->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
138365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether MatSolve is performed on the CPU.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the type-specific implementation if one is composed on A; a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
163365b711fSMark Adams 
/* MatSetOption implementation for SEQAIJCUSPARSE: intercepts MAT_FORM_EXPLICIT_TRANSPOSE
   (which requires invalidating any cached GPU transpose) and forwards every other
   option to the plain SeqAIJ handler. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
176e6e9a74fSStefano Zampini 
177bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
178bddcd29dSMark Adams 
/* Numeric LU factorization for SEQAIJCUSPARSE. The factorization itself is done by the
   built-in CPU kernel (after pulling the matrix values back from the GPU); the resulting
   factors are then staged on the GPU for the triangular solves unless the user asked to
   keep the solve on the CPU. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ         *b    = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscBool           rowIdentity, colIdentity;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));     /* make sure host values are current */
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); /* CPU factorization */
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used: natural ordering allows the
     permutation-free kernels */
  PetscCall(ISIdentity(b->row, &rowIdentity));
  PetscCall(ISIdentity(b->col, &colIdentity));

  if (!cusp->use_cpu_solve) {
    if (rowIdentity && colIdentity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors onto the GPU for the GPU solve path */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
209bddcd29dSMark Adams 
/* Process runtime options for a SEQAIJCUSPARSE matrix: GPU storage formats, CPU-vs-GPU
   triangular solve, and (for CUDA >= 11) the cuSPARSE algorithm choices for SpMV, SpMM,
   and CSR->CSC conversion. Options apply only to unfactored matrices, hence the
   MAT_FACTOR_NONE guard. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for MatMult only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
2449ae82921SPaul Mullowney 
/* Build (or refresh) the GPU copy of the unit-lower-triangular ILU factor of A.
   The lower factor is assembled on the host in CSR form with an explicit 1.0 on each
   diagonal (cuSPARSE still needs the entry present even with CUSPARSE_DIAG_TYPE_UNIT
   set on newer CUDA versions where the matrix type is GENERAL), copied to the device,
   and the csrsv solve analysis is performed. On subsequent calls with an unchanged
   sparsity pattern only the numerical values are re-copied.
   NOTE(review): host staging buffers use cudaMallocHost (pinned memory) so the
   host-to-device thrust::assign transfers are fast. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild when the up-to-date copy lives on the CPU (or nothing is on the GPU yet) */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* (row 0 of the strictly-lower part is empty, so ai[n]-ai[1] counts all off-diagonal entries) */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 holds only its unit diagonal */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz      = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the unit diagonal entry after the strictly-lower entries of row i */
          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the workspace the csrsv analysis/solve requires */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer (AA_h) for fast value-only updates; free the index buffers */
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
3779ae82921SPaul Mullowney 
3789371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
3799ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3809ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3819ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
382aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
3839ae82921SPaul Mullowney   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
3849ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3859ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3869ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
3879ae82921SPaul Mullowney 
3889ae82921SPaul Mullowney   PetscFunctionBegin;
389cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
390c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3919ae82921SPaul Mullowney     try {
3929ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3939ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
394da79fbbcSStefano Zampini       if (!upTriFactor) {
3952cbc15d9SMark         PetscScalar *AAUp;
3962cbc15d9SMark 
3979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
3982cbc15d9SMark 
3999ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4009566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4019566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4029ae82921SPaul Mullowney 
4039ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4049ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4059ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4069ae82921SPaul Mullowney         offset  = nzUpper;
4079ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4089ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4099ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4109ae82921SPaul Mullowney 
411e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4129ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4139ae82921SPaul Mullowney 
414e057df02SPaul Mullowney           /* decrement the offset */
4159ae82921SPaul Mullowney           offset -= (nz + 1);
4169ae82921SPaul Mullowney 
417e057df02SPaul Mullowney           /* first, set the diagonal elements */
4189ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
41909f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
4209ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
4219ae82921SPaul Mullowney 
4229566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
4239566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
4249ae82921SPaul Mullowney         }
4252205254eSKarl Rupp 
426aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4279566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
428da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
4292205254eSKarl Rupp 
430aa372e3fSPaul Mullowney         /* Create the matrix description */
4319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
4329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4349566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
435afb2bd1cSJunchao Zhang #else
4369566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
437afb2bd1cSJunchao Zhang #endif
4389566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
4399566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
440aa372e3fSPaul Mullowney 
441aa372e3fSPaul Mullowney         /* set the operation */
442aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
443aa372e3fSPaul Mullowney 
444aa372e3fSPaul Mullowney         /* set the matrix */
445aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
446aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
447aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
448aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
449aa372e3fSPaul Mullowney 
450aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
451aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
452aa372e3fSPaul Mullowney 
453aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
454aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
455aa372e3fSPaul Mullowney 
456aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
457aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
458aa372e3fSPaul Mullowney 
459afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4609566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
461261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
4621b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4639371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4649371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
4659566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
466afb2bd1cSJunchao Zhang #endif
467afb2bd1cSJunchao Zhang 
468aa372e3fSPaul Mullowney         /* perform the solve analysis */
4699371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4709371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
4711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4729371c9d4SSatish Balay                                                   upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
473d49cd2b7SBarry Smith #else
4745f80ce2aSJacob Faibussowitsch                                                   upTriFactor->solveInfo));
475afb2bd1cSJunchao Zhang #endif
4769566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4779566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
478aa372e3fSPaul Mullowney 
479da79fbbcSStefano Zampini         /* assign the pointer */
480aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
4812cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
4829566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
4839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
4849566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
485da79fbbcSStefano Zampini       } else {
486*48a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
487da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
488da79fbbcSStefano Zampini         offset = nzUpper;
489da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
490da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
491da79fbbcSStefano Zampini 
492da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
493da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
494da79fbbcSStefano Zampini 
495da79fbbcSStefano Zampini           /* decrement the offset */
496da79fbbcSStefano Zampini           offset -= (nz + 1);
497da79fbbcSStefano Zampini 
498da79fbbcSStefano Zampini           /* first, set the diagonal elements */
4992cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
5009566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
501da79fbbcSStefano Zampini         }
5022cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5039566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
504da79fbbcSStefano Zampini       }
5059371c9d4SSatish Balay     } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
5069ae82921SPaul Mullowney   }
5079ae82921SPaul Mullowney   PetscFunctionReturn(0);
5089ae82921SPaul Mullowney }
5099ae82921SPaul Mullowney 
5109371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
5119ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
5129ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
5139ae82921SPaul Mullowney   IS                            isrow = a->row, iscol = a->icol;
5149ae82921SPaul Mullowney   PetscBool                     row_identity, col_identity;
5159ae82921SPaul Mullowney   PetscInt                      n = A->rmap->n;
5169ae82921SPaul Mullowney 
5179ae82921SPaul Mullowney   PetscFunctionBegin;
51828b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
5199566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
5209566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
5212205254eSKarl Rupp 
522da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
523aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = a->nz;
5249ae82921SPaul Mullowney 
525c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
526e057df02SPaul Mullowney   /* lower triangular indices */
5279566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
528da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
529da79fbbcSStefano Zampini     const PetscInt *r;
530da79fbbcSStefano Zampini 
5319566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow, &r));
532aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
533aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r + n);
5349566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow, &r));
5359566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
536da79fbbcSStefano Zampini   }
5379ae82921SPaul Mullowney 
538e057df02SPaul Mullowney   /* upper triangular indices */
5399566063dSJacob Faibussowitsch   PetscCall(ISIdentity(iscol, &col_identity));
540da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
541da79fbbcSStefano Zampini     const PetscInt *c;
542da79fbbcSStefano Zampini 
5439566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iscol, &c));
544aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
545aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c + n);
5469566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iscol, &c));
5479566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
548da79fbbcSStefano Zampini   }
5499ae82921SPaul Mullowney   PetscFunctionReturn(0);
5509ae82921SPaul Mullowney }
5519ae82921SPaul Mullowney 
5529371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
553087f3262SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
554087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
555aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
556aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
557087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
558087f3262SPaul Mullowney   PetscScalar                       *AAUp;
559087f3262SPaul Mullowney   PetscScalar                       *AALo;
560087f3262SPaul Mullowney   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
561087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
562087f3262SPaul Mullowney   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
563087f3262SPaul Mullowney   const MatScalar                   *aa = b->a, *v;
564087f3262SPaul Mullowney 
565087f3262SPaul Mullowney   PetscFunctionBegin;
566cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
567c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
568087f3262SPaul Mullowney     try {
5699566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
5709566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
571da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
572087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
5739566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
5749566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
575087f3262SPaul Mullowney 
576087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
577087f3262SPaul Mullowney         AiUp[0] = (PetscInt)0;
578087f3262SPaul Mullowney         AiUp[n] = nzUpper;
579087f3262SPaul Mullowney         offset  = 0;
580087f3262SPaul Mullowney         for (i = 0; i < n; i++) {
581087f3262SPaul Mullowney           /* set the pointers */
582087f3262SPaul Mullowney           v  = aa + ai[i];
583087f3262SPaul Mullowney           vj = aj + ai[i];
584087f3262SPaul Mullowney           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
585087f3262SPaul Mullowney 
586087f3262SPaul Mullowney           /* first, set the diagonal elements */
587087f3262SPaul Mullowney           AjUp[offset] = (PetscInt)i;
58809f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0 / v[nz];
589087f3262SPaul Mullowney           AiUp[i]      = offset;
59009f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0 / v[nz];
591087f3262SPaul Mullowney 
592087f3262SPaul Mullowney           offset += 1;
593087f3262SPaul Mullowney           if (nz > 0) {
5949566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
5959566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
596087f3262SPaul Mullowney             for (j = offset; j < offset + nz; j++) {
597087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
598087f3262SPaul Mullowney               AALo[j] = AAUp[j] / v[nz];
599087f3262SPaul Mullowney             }
600087f3262SPaul Mullowney             offset += nz;
601087f3262SPaul Mullowney           }
602087f3262SPaul Mullowney         }
603087f3262SPaul Mullowney 
604aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
6059566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
606da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
607087f3262SPaul Mullowney 
608aa372e3fSPaul Mullowney         /* Create the matrix description */
6099566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
6109566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
6111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6129566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
613afb2bd1cSJunchao Zhang #else
6149566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
615afb2bd1cSJunchao Zhang #endif
6169566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
6179566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
618087f3262SPaul Mullowney 
619aa372e3fSPaul Mullowney         /* set the matrix */
620aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
621aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = A->rmap->n;
622aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = A->cmap->n;
623aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
624aa372e3fSPaul Mullowney 
625aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
626aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
627aa372e3fSPaul Mullowney 
628aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
629aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
630aa372e3fSPaul Mullowney 
631aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
632aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
633aa372e3fSPaul Mullowney 
634afb2bd1cSJunchao Zhang         /* set the operation */
635afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
636afb2bd1cSJunchao Zhang 
637afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
6389566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
639261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
6401b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6419371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
6429371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
6439566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
644afb2bd1cSJunchao Zhang #endif
645afb2bd1cSJunchao Zhang 
646aa372e3fSPaul Mullowney         /* perform the solve analysis */
6479371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
6489371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
6491b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6509371c9d4SSatish Balay                                                   upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
651d49cd2b7SBarry Smith #else
6525f80ce2aSJacob Faibussowitsch                                                   upTriFactor->solveInfo));
653afb2bd1cSJunchao Zhang #endif
6549566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
6559566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
656aa372e3fSPaul Mullowney 
657da79fbbcSStefano Zampini         /* assign the pointer */
658aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
659aa372e3fSPaul Mullowney 
660aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
6619566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
662da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
663aa372e3fSPaul Mullowney 
664aa372e3fSPaul Mullowney         /* Create the matrix description */
6659566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
6669566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
6671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
669afb2bd1cSJunchao Zhang #else
6709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
671afb2bd1cSJunchao Zhang #endif
6729566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
6739566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
674aa372e3fSPaul Mullowney 
675aa372e3fSPaul Mullowney         /* set the operation */
676aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
677aa372e3fSPaul Mullowney 
678aa372e3fSPaul Mullowney         /* set the matrix */
679aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
680aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = A->rmap->n;
681aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = A->cmap->n;
682aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
683aa372e3fSPaul Mullowney 
684aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
685aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
686aa372e3fSPaul Mullowney 
687aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
688aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
689aa372e3fSPaul Mullowney 
690aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
691aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
692aa372e3fSPaul Mullowney 
693afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
6949566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
695261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
6961b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6979371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
6989371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
6999566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
700afb2bd1cSJunchao Zhang #endif
701afb2bd1cSJunchao Zhang 
702aa372e3fSPaul Mullowney         /* perform the solve analysis */
7039371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
7049371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
7051b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
7069371c9d4SSatish Balay                                                   loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
707d49cd2b7SBarry Smith #else
7085f80ce2aSJacob Faibussowitsch                                                   loTriFactor->solveInfo));
709afb2bd1cSJunchao Zhang #endif
7109566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
7119566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
712aa372e3fSPaul Mullowney 
713da79fbbcSStefano Zampini         /* assign the pointer */
714aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
715087f3262SPaul Mullowney 
7169566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
7179566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
7189566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
719da79fbbcSStefano Zampini       } else {
720da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
721da79fbbcSStefano Zampini         offset = 0;
722da79fbbcSStefano Zampini         for (i = 0; i < n; i++) {
723da79fbbcSStefano Zampini           /* set the pointers */
724da79fbbcSStefano Zampini           v  = aa + ai[i];
725da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
726da79fbbcSStefano Zampini 
727da79fbbcSStefano Zampini           /* first, set the diagonal elements */
728da79fbbcSStefano Zampini           AAUp[offset] = 1.0 / v[nz];
729da79fbbcSStefano Zampini           AALo[offset] = 1.0 / v[nz];
730da79fbbcSStefano Zampini 
731da79fbbcSStefano Zampini           offset += 1;
732da79fbbcSStefano Zampini           if (nz > 0) {
7339566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
734da79fbbcSStefano Zampini             for (j = offset; j < offset + nz; j++) {
735da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
736da79fbbcSStefano Zampini               AALo[j] = AAUp[j] / v[nz];
737da79fbbcSStefano Zampini             }
738da79fbbcSStefano Zampini             offset += nz;
739da79fbbcSStefano Zampini           }
740da79fbbcSStefano Zampini         }
74128b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
74228b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
743da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
744da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
7459566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
746da79fbbcSStefano Zampini       }
7479566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
7489566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
7499371c9d4SSatish Balay     } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
750087f3262SPaul Mullowney   }
751087f3262SPaul Mullowney   PetscFunctionReturn(0);
752087f3262SPaul Mullowney }
753087f3262SPaul Mullowney 
7549371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
755087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
756087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
757087f3262SPaul Mullowney   IS                            ip                 = a->row;
758087f3262SPaul Mullowney   PetscBool                     perm_identity;
759087f3262SPaul Mullowney   PetscInt                      n = A->rmap->n;
760087f3262SPaul Mullowney 
761087f3262SPaul Mullowney   PetscFunctionBegin;
76228b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
7639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
764da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
765aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
766aa372e3fSPaul Mullowney 
767da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
768da79fbbcSStefano Zampini 
769087f3262SPaul Mullowney   /* lower triangular indices */
7709566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
771087f3262SPaul Mullowney   if (!perm_identity) {
7724e4bbfaaSStefano Zampini     IS              iip;
773da79fbbcSStefano Zampini     const PetscInt *irip, *rip;
7744e4bbfaaSStefano Zampini 
7759566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
7769566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip, &irip));
7779566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip, &rip));
778aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
779aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
780aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
7814e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
7829566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip, &irip));
7839566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
7849566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip, &rip));
7859566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
786da79fbbcSStefano Zampini   }
787087f3262SPaul Mullowney   PetscFunctionReturn(0);
788087f3262SPaul Mullowney }
789087f3262SPaul Mullowney 
7909371c9d4SSatish Balay static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
791087f3262SPaul Mullowney   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
792087f3262SPaul Mullowney   IS          ip = b->row;
793087f3262SPaul Mullowney   PetscBool   perm_identity;
794087f3262SPaul Mullowney 
795087f3262SPaul Mullowney   PetscFunctionBegin;
7969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
7979566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
798ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
799087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
8009566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
801087f3262SPaul Mullowney   if (perm_identity) {
802087f3262SPaul Mullowney     B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
803087f3262SPaul Mullowney     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
8044e4bbfaaSStefano Zampini     B->ops->matsolve          = NULL;
8054e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
806087f3262SPaul Mullowney   } else {
807087f3262SPaul Mullowney     B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
808087f3262SPaul Mullowney     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
8094e4bbfaaSStefano Zampini     B->ops->matsolve          = NULL;
8104e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
811087f3262SPaul Mullowney   }
812087f3262SPaul Mullowney 
813087f3262SPaul Mullowney   /* get the triangular factors */
8149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
815087f3262SPaul Mullowney   PetscFunctionReturn(0);
816087f3262SPaul Mullowney }
8179ae82921SPaul Mullowney 
8189371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
819bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
820aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
821aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
822da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
823da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
824aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
825aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
826aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
827aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
828b175d8bbSPaul Mullowney 
829bda325fcSPaul Mullowney   PetscFunctionBegin;
830aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
8319566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
832da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
833aa372e3fSPaul Mullowney 
834aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
835aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
836aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
8379371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
838aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
839aa372e3fSPaul Mullowney 
840aa372e3fSPaul Mullowney   /* Create the matrix description */
8419566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
8429566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
8439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
8449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
8459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney   /* set the operation */
848aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
849aa372e3fSPaul Mullowney 
850aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
851aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
852afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
853afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
854aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
855afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
856afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
857afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
858aa372e3fSPaul Mullowney 
859aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
860afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8619371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
8629371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
8639371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
8649566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
865afb2bd1cSJunchao Zhang #endif
866afb2bd1cSJunchao Zhang 
8679566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
8689371c9d4SSatish Balay   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
8699371c9d4SSatish Balay                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
870afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8719371c9d4SSatish Balay                                      loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
872afb2bd1cSJunchao Zhang #else
8739371c9d4SSatish Balay                                      loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
874afb2bd1cSJunchao Zhang #endif
8759566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
8769566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
877aa372e3fSPaul Mullowney 
878afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
8799566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
880261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
8811b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8829371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
8839371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
8849566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
885afb2bd1cSJunchao Zhang #endif
886afb2bd1cSJunchao Zhang 
887afb2bd1cSJunchao Zhang   /* perform the solve analysis */
8889371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
8899371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(),
8901b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
8919371c9d4SSatish Balay                                             loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
892d49cd2b7SBarry Smith #else
8935f80ce2aSJacob Faibussowitsch                                             loTriFactorT->solveInfo));
894afb2bd1cSJunchao Zhang #endif
8959566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
8969566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
897aa372e3fSPaul Mullowney 
898da79fbbcSStefano Zampini   /* assign the pointer */
899aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
900aa372e3fSPaul Mullowney 
901aa372e3fSPaul Mullowney   /*********************************************/
902aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
903aa372e3fSPaul Mullowney   /*********************************************/
904aa372e3fSPaul Mullowney 
905aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
9069566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
907da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
908aa372e3fSPaul Mullowney 
909aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
910aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
911aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
9129371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
913aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
914aa372e3fSPaul Mullowney 
915aa372e3fSPaul Mullowney   /* Create the matrix description */
9169566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
9179566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
9189566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
9199566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
9209566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
921aa372e3fSPaul Mullowney 
922aa372e3fSPaul Mullowney   /* set the operation */
923aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
924aa372e3fSPaul Mullowney 
925aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
926aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
927afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
928afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
929aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
930afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
931afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
932afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
933aa372e3fSPaul Mullowney 
934aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
935afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9369371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
9379371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
9389371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
9399566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
940afb2bd1cSJunchao Zhang #endif
941afb2bd1cSJunchao Zhang 
9429566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
9439371c9d4SSatish Balay   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
9449371c9d4SSatish Balay                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
945afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9469371c9d4SSatish Balay                                      upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
947afb2bd1cSJunchao Zhang #else
9489371c9d4SSatish Balay                                      upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
949afb2bd1cSJunchao Zhang #endif
950d49cd2b7SBarry Smith 
9519566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9529566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
953aa372e3fSPaul Mullowney 
954afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
9559566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
956261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
9571b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9589371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9599371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
9609566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
961afb2bd1cSJunchao Zhang #endif
962afb2bd1cSJunchao Zhang 
963afb2bd1cSJunchao Zhang   /* perform the solve analysis */
9645f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
9659371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9669371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(),
9671b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9689371c9d4SSatish Balay                                             upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
969d49cd2b7SBarry Smith #else
9705f80ce2aSJacob Faibussowitsch                                             upTriFactorT->solveInfo));
971afb2bd1cSJunchao Zhang #endif
972d49cd2b7SBarry Smith 
9739566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9749566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
975aa372e3fSPaul Mullowney 
976da79fbbcSStefano Zampini   /* assign the pointer */
977aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
978bda325fcSPaul Mullowney   PetscFunctionReturn(0);
979bda325fcSPaul Mullowney }
980bda325fcSPaul Mullowney 
9819371c9d4SSatish Balay struct PetscScalarToPetscInt {
9829371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
983a49f1ed0SStefano Zampini };
984a49f1ed0SStefano Zampini 
9859371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
986aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
987a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
988bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
989bda325fcSPaul Mullowney   cusparseStatus_t              stat;
990aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
991b175d8bbSPaul Mullowney 
992bda325fcSPaul Mullowney   PetscFunctionBegin;
9939566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
994a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
99528b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
996a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
99708401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
9981a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
9999566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1001*48a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1002a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1003aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
10049566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1005aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
10069566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
10079566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1008aa372e3fSPaul Mullowney 
1009b06137fdSPaul Mullowney     /* set alpha and beta */
10109566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
10119566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
10129566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
10139566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1016b06137fdSPaul Mullowney 
1017aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1018aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1019a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1020554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1021554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1022aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1023a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1024aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1025aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1026a3fdcf43SKarl Rupp 
1027039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); }
102881902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1029afb2bd1cSJunchao Zhang 
1030afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10313606e59fSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
10329371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
10339371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
10349371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
10353606e59fSJunchao Zhang #else
10363606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
10373606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
10383606e59fSJunchao Zhang 
10393606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
10403606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
10413606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
10423606e59fSJunchao Zhang         */
10433606e59fSJunchao Zhang       if (matrixT->num_entries) {
10449371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
10459371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
10463606e59fSJunchao Zhang 
10473606e59fSJunchao Zhang       } else {
10483606e59fSJunchao Zhang         matstructT->matDescr = NULL;
10493606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
10503606e59fSJunchao Zhang       }
10513606e59fSJunchao Zhang #endif
1052afb2bd1cSJunchao Zhang #endif
1053aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1054afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1055afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1056afb2bd1cSJunchao Zhang #else
1057aa372e3fSPaul Mullowney       CsrMatrix *temp = new CsrMatrix;
105851c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
105951c6d536SStefano Zampini       /* First convert HYB to CSR */
1060aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1061aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1062aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1063aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1064aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1065aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1066aa372e3fSPaul Mullowney 
10679371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
10689371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1069aa372e3fSPaul Mullowney 
1070aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1071aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1072aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1073aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1074aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1075aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1076aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1077aa372e3fSPaul Mullowney 
10789371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
10799371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
10809371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1081aa372e3fSPaul Mullowney 
1082aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1083aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
10849566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
10859371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
10869371c9d4SSatish Balay       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
10879371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1088aa372e3fSPaul Mullowney 
1089aa372e3fSPaul Mullowney       /* assign the pointer */
1090aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
10911a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1092aa372e3fSPaul Mullowney       /* delete temporaries */
1093aa372e3fSPaul Mullowney       if (tempT) {
1094aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1095aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1096aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1097aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1098087f3262SPaul Mullowney       }
1099aa372e3fSPaul Mullowney       if (temp) {
1100aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1101aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1102aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1103aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1104aa372e3fSPaul Mullowney       }
1105afb2bd1cSJunchao Zhang #endif
1106aa372e3fSPaul Mullowney     }
1107a49f1ed0SStefano Zampini   }
1108a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1109a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1110a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
111128b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
111228b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
111328b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
111428b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
111528b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
111628b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
111728b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
111828b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1119a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1120a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1121a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
11229566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1123a49f1ed0SStefano Zampini     }
1124a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1125a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1126792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1127a49f1ed0SStefano Zampini 
1128a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1129a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1130a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1131a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
11329371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
11339371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
11349371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
11359566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1136a49f1ed0SStefano Zampini #endif
1137a49f1ed0SStefano Zampini 
11381a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
11391a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
11401a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
11411a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
11421a2c6b5cSJunchao Zhang 
11431a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
11441a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
11451a2c6b5cSJunchao Zhang         */
11469371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1147a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11489371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
11499371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1150a49f1ed0SStefano Zampini #else
11519371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
11529371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1153a49f1ed0SStefano Zampini #endif
11541a2c6b5cSJunchao Zhang       } else {
11551a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
11561a2c6b5cSJunchao Zhang       }
11571a2c6b5cSJunchao Zhang 
1158a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1159792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1160a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11619566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1162a49f1ed0SStefano Zampini #endif
1163a49f1ed0SStefano Zampini     }
11649371c9d4SSatish Balay     PetscCallThrust(
11659371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1166a49f1ed0SStefano Zampini   }
11679566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
11689566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1169213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1170213423ffSJunchao Zhang   matstructT->cprowIndices                       = NULL;
1171aa372e3fSPaul Mullowney   /* assign the pointer */
1172aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
11731a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
1174bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1175bda325fcSPaul Mullowney }
1176bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU where A = L*U was factored with row/column permutations.
   Since (L*U)^T = U^T L^T, the solve applies the stored transposed U factor first,
   then the transposed L factor; the explicitly transposed factor structures are
   built lazily on first use. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (the analysis routine
     sets both transpose factor pointers together, so testing both is safe) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation; xGPU is used as scratch for the permuted rhs */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the transposed U factor (rhs xarray, result tempGPU) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed L factor (rhs tempGPU, result xarray) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1250bda325fcSPaul Mullowney 
/* Solve A^T x = b when the factorization used the natural ordering (no row/column
   permutations), so the permutation passes of MatSolveTranspose_SeqAIJCUSPARSE()
   are skipped: solve with the transposed U factor, then the transposed L factor */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (both transpose
     factor pointers are set together by the analysis routine) */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed U factor (rhs barray, result tempGPU) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed L factor (rhs tempGPU, result xarray) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1309bda325fcSPaul Mullowney 
/* Solve A x = b on the GPU using the legacy cuSPARSE csrsv triangular solves,
   where A = L*U was factored with row/column permutations: permute b, solve the
   L factor, solve the U factor, then permute the result into x. The triangular
   factors are assumed to have been built by the numeric factorization already. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L (rhs tempGPU, result xarray) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U (rhs xarray, result tempGPU) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
13719ae82921SPaul Mullowney 
/* Solve A x = b when the factorization used the natural ordering (no row/column
   permutations): solve the L factor, then the U factor, with no permutation passes */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L (rhs barray, result tempGPU) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U (rhs tempGPU, result xarray) */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
14229ae82921SPaul Mullowney 
1423da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1424da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
14259371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
1426da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1427da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1428da112707SJunchao Zhang   const PetscScalar            *barray;
1429da112707SJunchao Zhang   PetscScalar                  *xarray;
1430da112707SJunchao Zhang 
1431da112707SJunchao Zhang   PetscFunctionBegin;
1432da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1433da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1434da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1435da112707SJunchao Zhang 
1436da112707SJunchao Zhang   /* Solve L*y = b */
1437da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1438da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
14399371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
14409371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
144112ba2bc6SJunchao Zhang                                        fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()!
1442da112707SJunchao Zhang 
1443da112707SJunchao Zhang   /* Solve U*x = y */
1444da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
14459371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
14469371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1447da112707SJunchao Zhang 
1448da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1449da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1450da112707SJunchao Zhang 
1451da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1452da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1453da112707SJunchao Zhang   PetscFunctionReturn(0);
1454da112707SJunchao Zhang }
1455da112707SJunchao Zhang 
/* MatSolveTranspose_SeqAIJCUSPARSE_ILU0 - transpose-solve phase of the SpSV-based
   ILU(0): computes x = L^T \ (U^T \ b) on the GPU, since (L*U)^T = U^T L^T.

   The L/U sparse-matrix descriptors are shared with the non-transpose solve; we
   only create separate SpSV descriptors (Lt/Ut) that request op = TRANSPOSE, and
   do so lazily the first time a transpose solve is requested on this factor. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *tri = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a   = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bv;
  PetscScalar                  *xv;

  PetscFunctionBegin;
  if (!tri->createdTransposeSpSVDescr) { /* First call to MatSolveTranspose(): set up descriptors and work buffers */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&tri->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, /* The matrix is still L; we only do transpose solves with it */
                                              tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt, &tri->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&tri->spsvBuffer_Lt, tri->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&tri->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_U, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Ut, &tri->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&tri->spsvBuffer_Ut, tri->spsvBufferSize_Ut));
    tri->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the numeric SpSV analysis if the factor values changed since the last transpose solve */
  if (!tri->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt, tri->spsvBuffer_Lt));
    PetscCallCUSPARSE(cusparseSpSV_analysis(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_U, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Ut, tri->spsvBuffer_Ut));
    tri->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayRead(b, &bv));
  PetscCall(VecCUDAGetArrayWrite(x, &xv));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve U^T*y = b, with the intermediate y held in the device buffer tri->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, (void *)bv));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_Y, tri->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_U, /* Ut Y = X */
                                       tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Ut));

  /* Solve L^T*x = y, writing the result straight into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, xv));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, /* Lt X = Y */
                                       tri->dnVecDescr_Y, tri->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayWrite(x, &xv));
  PetscCall(VecCUDARestoreArrayRead(b, &bv));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1503da112707SJunchao Zhang 
/* MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of the cuSPARSE-native
   ILU(0): copy A's values into the factor (ILU(0) keeps A's nonzero pattern, so
   only values need copying), run csrilu02 in place on the GPU, then redo the SpSV
   analysis the solve routines rely on. `info` is not used by this routine. */
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  /* device-to-device, queued on PETSc's default stream so it orders with the factorization below */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) { /* query cuSPARSE for a zero pivot hit during the factorization (debug builds only) */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Install the SpSV-based solvers; mat-solve variants are not provided by this path */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1554da112707SJunchao Zhang 
15559371c9d4SSatish Balay static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
1556da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1557da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1558da112707SJunchao Zhang   PetscInt                      m, nz;
1559da112707SJunchao Zhang 
1560da112707SJunchao Zhang   PetscFunctionBegin;
1561da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1562da112707SJunchao Zhang     PetscInt  i;
1563da112707SJunchao Zhang     PetscBool flg, missing;
1564da112707SJunchao Zhang 
1565da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1566da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1567da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1568da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1569da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1570da112707SJunchao Zhang   }
1571da112707SJunchao Zhang 
1572da112707SJunchao Zhang   /* Free the old stale stuff */
1573da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1574da112707SJunchao Zhang 
1575da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1576da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1577da112707SJunchao Zhang    */
1578da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1579da112707SJunchao Zhang 
1580da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1581da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1582da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1583da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1584da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1585da112707SJunchao Zhang 
1586da112707SJunchao Zhang   aij->row = NULL;
1587da112707SJunchao Zhang   aij->col = NULL;
1588da112707SJunchao Zhang 
1589da112707SJunchao Zhang   /* ====================================================================== */
1590da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1591da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1592da112707SJunchao Zhang   /* ====================================================================== */
1593da112707SJunchao Zhang   const int *Ai, *Aj;
1594da112707SJunchao Zhang 
1595da112707SJunchao Zhang   m  = fact->rmap->n;
1596da112707SJunchao Zhang   nz = aij->nz;
1597da112707SJunchao Zhang 
1598da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1599da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1600da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1601da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1602da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1603da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1604da112707SJunchao Zhang 
1605da112707SJunchao Zhang   /* ====================================================================== */
1606da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1607da112707SJunchao Zhang   /* ====================================================================== */
1608da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1609da112707SJunchao Zhang   cusparseDiagType_t diagType;
1610da112707SJunchao Zhang 
1611da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1612da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1613da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1614da112707SJunchao Zhang 
1615da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1616da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1617da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1618da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1619da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1620da112707SJunchao Zhang   */
1621da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1622da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
16239371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
16249371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
16259371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1626da112707SJunchao Zhang 
1627da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1628da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
16299371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
16309371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
16319371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1632da112707SJunchao Zhang 
1633da112707SJunchao Zhang   /* ========================================================================= */
1634da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1635da112707SJunchao Zhang   /* ========================================================================= */
1636da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
16379371c9d4SSatish Balay   if (m)
16389371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
16399371c9d4SSatish Balay                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1640da112707SJunchao Zhang 
1641da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1642da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1643da112707SJunchao Zhang 
1644da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1645da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1646da112707SJunchao Zhang 
1647da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
16489371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1649da112707SJunchao Zhang 
1650da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
16519371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1652da112707SJunchao Zhang 
1653da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
165412ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
165512ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
165612ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1657da112707SJunchao Zhang    */
165812ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
165912ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
166012ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1661da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
166212ba2bc6SJunchao Zhang   } else {
166312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
166412ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1665da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
166612ba2bc6SJunchao Zhang   }
1667da112707SJunchao Zhang 
1668da112707SJunchao Zhang   /* ========================================================================== */
1669da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1670da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1671da112707SJunchao Zhang   /* ========================================================================== */
1672da112707SJunchao Zhang   int              structural_zero;
1673da112707SJunchao Zhang   cusparseStatus_t status;
1674da112707SJunchao Zhang 
1675da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
16769371c9d4SSatish Balay   if (m)
16779371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
16789371c9d4SSatish Balay                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1679da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1680da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1681da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1682da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1683da112707SJunchao Zhang   }
1684da112707SJunchao Zhang 
1685da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
16860dd8c0acSJunchao Zhang   {
1687da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
16880dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1689da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1690da112707SJunchao Zhang 
1691da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1692da112707SJunchao Zhang     Ai    = Aseq->i;
1693da112707SJunchao Zhang     Adiag = Aseq->diag;
1694da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1695da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1696da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1697da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1698da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1699da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1700da112707SJunchao Zhang         */
1701da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1702da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1703da112707SJunchao Zhang       }
1704da112707SJunchao Zhang     }
1705da112707SJunchao Zhang     fs->numericFactFlops = flops;
17060dd8c0acSJunchao Zhang   }
1707da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1708da112707SJunchao Zhang   PetscFunctionReturn(0);
1709da112707SJunchao Zhang }
1710da112707SJunchao Zhang 
17119371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
1712da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1713da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1714da112707SJunchao Zhang   const PetscScalar            *barray;
1715da112707SJunchao Zhang   PetscScalar                  *xarray;
1716da112707SJunchao Zhang 
1717da112707SJunchao Zhang   PetscFunctionBegin;
1718da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1719da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1720da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1721da112707SJunchao Zhang 
1722da112707SJunchao Zhang   /* Solve L*y = b */
1723da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1724da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
17259371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
17269371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1727da112707SJunchao Zhang 
1728da112707SJunchao Zhang   /* Solve Lt*x = y */
1729da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
17309371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
17319371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1732da112707SJunchao Zhang 
1733da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1734da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1735da112707SJunchao Zhang 
1736da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1737da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1738da112707SJunchao Zhang   PetscFunctionReturn(0);
1739da112707SJunchao Zhang }
1740da112707SJunchao Zhang 
17419371c9d4SSatish Balay static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
1742da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1743da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1744da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1745da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1746da112707SJunchao Zhang   PetscInt                      m, nz;
1747da112707SJunchao Zhang   PetscBool                     flg;
1748da112707SJunchao Zhang 
1749da112707SJunchao Zhang   PetscFunctionBegin;
1750da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1751da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1752da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1753da112707SJunchao Zhang   }
1754da112707SJunchao Zhang 
1755da112707SJunchao Zhang   /* Copy A's value to fact */
1756da112707SJunchao Zhang   m  = fact->rmap->n;
1757da112707SJunchao Zhang   nz = aij->nz;
1758da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1759da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1760da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1761da112707SJunchao Zhang 
1762da112707SJunchao Zhang   /* Factorize fact inplace */
1763da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1764da112707SJunchao Zhang      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1765da112707SJunchao Zhang      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1766da112707SJunchao Zhang      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1767da112707SJunchao Zhang      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1768da112707SJunchao Zhang    */
17699371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1770da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1771da112707SJunchao Zhang     int              numerical_zero;
1772da112707SJunchao Zhang     cusparseStatus_t status;
1773da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1774da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1775da112707SJunchao Zhang   }
1776da112707SJunchao Zhang 
17779371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1778da112707SJunchao Zhang 
1779da112707SJunchao Zhang   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1780da112707SJunchao Zhang     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1781da112707SJunchao Zhang   */
17829371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1783da112707SJunchao Zhang 
1784da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1785da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1786da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1787da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1788da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1789da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1790da112707SJunchao Zhang   PetscFunctionReturn(0);
1791da112707SJunchao Zhang }
1792da112707SJunchao Zhang 
17939371c9d4SSatish Balay static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
1794da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1795da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1796da112707SJunchao Zhang   PetscInt                      m, nz;
1797da112707SJunchao Zhang 
1798da112707SJunchao Zhang   PetscFunctionBegin;
1799da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1800da112707SJunchao Zhang     PetscInt  i;
1801da112707SJunchao Zhang     PetscBool flg, missing;
1802da112707SJunchao Zhang 
1803da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1804da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1805da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1806da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1807da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1808da112707SJunchao Zhang   }
1809da112707SJunchao Zhang 
1810da112707SJunchao Zhang   /* Free the old stale stuff */
1811da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1812da112707SJunchao Zhang 
1813da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1814da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1815da112707SJunchao Zhang    */
1816da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1817da112707SJunchao Zhang 
1818da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1819da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
1820da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1821da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1822da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1823da112707SJunchao Zhang 
1824da112707SJunchao Zhang   aij->row = NULL;
1825da112707SJunchao Zhang   aij->col = NULL;
1826da112707SJunchao Zhang 
1827da112707SJunchao Zhang   /* ====================================================================== */
1828da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1829da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1830da112707SJunchao Zhang   /* ====================================================================== */
1831da112707SJunchao Zhang   const int *Ai, *Aj;
1832da112707SJunchao Zhang 
1833da112707SJunchao Zhang   m  = fact->rmap->n;
1834da112707SJunchao Zhang   nz = aij->nz;
1835da112707SJunchao Zhang 
1836da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1837da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1838da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1839da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1840da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1841da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1842da112707SJunchao Zhang 
1843da112707SJunchao Zhang   /* ====================================================================== */
1844da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
1845da112707SJunchao Zhang   /* ====================================================================== */
1846da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1847da112707SJunchao Zhang   cusparseDiagType_t diagType;
1848da112707SJunchao Zhang 
1849da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1850da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1851da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1852da112707SJunchao Zhang 
1853da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1854da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1855da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1856da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1857da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1858da112707SJunchao Zhang   */
1859da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1860da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
18619371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18629371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18639371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1864da112707SJunchao Zhang 
1865da112707SJunchao Zhang   /* ========================================================================= */
1866da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1867da112707SJunchao Zhang   /* ========================================================================= */
1868da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
18699371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1870da112707SJunchao Zhang 
1871da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1872da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1873da112707SJunchao Zhang 
1874da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1875da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1876da112707SJunchao Zhang 
1877da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18789371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1879da112707SJunchao Zhang 
1880da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
18819371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1882da112707SJunchao Zhang 
188312ba2bc6SJunchao Zhang   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
188412ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
188512ba2bc6SJunchao Zhang    */
188612ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
188712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
188812ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1889da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
189012ba2bc6SJunchao Zhang   } else {
189112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
189212ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
189312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
189412ba2bc6SJunchao Zhang   }
1895da112707SJunchao Zhang 
1896da112707SJunchao Zhang   /* ========================================================================== */
1897da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
1898da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
1899da112707SJunchao Zhang   /* ========================================================================== */
1900da112707SJunchao Zhang   int              structural_zero;
1901da112707SJunchao Zhang   cusparseStatus_t status;
1902da112707SJunchao Zhang 
1903da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
19049371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1905da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1906da112707SJunchao Zhang     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1907da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1908da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1909da112707SJunchao Zhang   }
1910da112707SJunchao Zhang 
1911da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
19120dd8c0acSJunchao Zhang   {
1913da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
19140dd8c0acSJunchao Zhang     PetscInt      *Ai, nzRow, nzLeft;
1915da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1916da112707SJunchao Zhang 
1917da112707SJunchao Zhang     Ai = Aseq->i;
1918da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1919da112707SJunchao Zhang       nzRow = Ai[i + 1] - Ai[i];
1920da112707SJunchao Zhang       if (nzRow > 1) {
1921da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1922da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1923da112707SJunchao Zhang         */
1924da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1925da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1926da112707SJunchao Zhang       }
1927da112707SJunchao Zhang     }
1928da112707SJunchao Zhang     fs->numericFactFlops = flops;
19290dd8c0acSJunchao Zhang   }
1930da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1931da112707SJunchao Zhang   PetscFunctionReturn(0);
1932da112707SJunchao Zhang }
1933da112707SJunchao Zhang #endif
1934da112707SJunchao Zhang 
/* Symbolic phase of ILU factorization of A into B.

   When cuSPARSE is new enough, the user asked for device factorization, the
   fill level is 0, and both row and column permutations are identities, the
   cusparse ILU(0) path is used.  Otherwise the host (SeqAIJ) symbolic
   factorization runs and the numeric phase is pointed at the CUSPARSE
   implementation. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  if (cusparseTriFactors->factorizeOnDevice) {
    /* The device ILU(0) path requires natural (identity) orderings; only test when device factorization was requested */
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Host fallback: drop any stale device factor data, run the CPU symbolic ILU(k) */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1956da112707SJunchao Zhang 
/* Symbolic phase of LU factorization of A into B.

   The symbolic work is always done on the host by the SeqAIJ implementation;
   the numeric phase is redirected to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Any previously built device triangular factors are now invalid */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  /* Host symbolic LU */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  /* Numeric factorization will build the device factors */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1966da112707SJunchao Zhang 
/* Symbolic phase of ICC factorization of A into B.

   When cuSPARSE is new enough, the user asked for device factorization, the
   fill level is 0, and the permutation is the identity, the cusparse IC(0)
   path is used.  Otherwise the host (SeqAIJ) symbolic factorization runs and
   the numeric phase is pointed at the CUSPARSE implementation. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  /* Device IC(0) requires a natural (identity) ordering */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* Host fallback: drop any stale device factor data, run the CPU symbolic ICC(k) */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
1985da112707SJunchao Zhang 
/* Symbolic phase of Cholesky factorization of A into B.

   The symbolic work is always done on the host by the SeqAIJ implementation;
   the numeric phase is redirected to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Any previously built device triangular factors are now invalid */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  /* Host symbolic Cholesky */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  /* Numeric factorization will build the device factors */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1995da112707SJunchao Zhang 
/* Query callback composed on factor matrices created by this package:
   reports MATSOLVERCUSPARSE ("cusparse") as the solver type. A is unused. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2001841d4cb1SJunchao Zhang 
2002841d4cb1SJunchao Zhang /*MC
2003841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
2004841d4cb1SJunchao Zhang   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2006841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2007841d4cb1SJunchao Zhang   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2008841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2009841d4cb1SJunchao Zhang 
2010841d4cb1SJunchao Zhang   Level: beginner
2011841d4cb1SJunchao Zhang 
2012841d4cb1SJunchao Zhang .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2013841d4cb1SJunchao Zhang M*/
2014841d4cb1SJunchao Zhang 
/* Create the factor matrix B (of type MATSEQAIJCUSPARSE) that will hold the
   LU/ILU/ILUDT or Cholesky/ICC factorization of the square matrix A.

   The options database key -mat_factor_bind_factorization {host|device}
   selects where the numeric factorization is performed (default "device");
   the choice is recorded in the Mat_SeqAIJCUSPARSETriFactors attached to B.
   Symbolic function pointers fall back to the plain SeqAIJ versions when A is
   bound to the CPU. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n)); /* factors of a square matrix are square */
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Parse -mat_factor_bind_factorization using the factor prefix if one was set, else A's prefix */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  /* Respect CPU binding propagated from A */
  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      /* GPU-aware symbolic routines (may still fall back to host internally) */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete factorizations */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2066841d4cb1SJunchao Zhang 
/* Copy the matrix values from the GPU back to the host array a->a when the
   device copy is the up-to-date one (offloadmask == PETSC_OFFLOAD_GPU).

   Unfactored matrices copy from the device CSR values; factored matrices copy
   from fs->csrVal when available. On success the offload mask becomes BOTH.
   NOTE(review): the guard value 13500 here differs from the 11500 used by the
   ILU0/ICC0 symbolic paths above — confirm which CUSPARSE_VERSION is intended. */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* spptr holds a Mat_SeqAIJCUSPARSETriFactors instead when A is factored */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* Plain matrix: values live in the device CSR structure */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH; /* host and device now agree */
  }
  PetscFunctionReturn(0);
}
20957e8381f9SStefano Zampini 
/* Read-write access to the host value array: first sync any newer GPU values
   down to the host, then hand out the host pointer. */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
210267a45760SJunchao Zhang 
/* End read-write access: the host values may have changed, so mark the host
   copy as the authoritative one and invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
210967a45760SJunchao Zhang 
/* Read-only access to the host value array: sync newer GPU values down first,
   then hand out the host pointer. */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
211667a45760SJunchao Zhang 
/* End read-only access: values were not modified, so the offload mask is left
   unchanged; only the caller's pointer is invalidated. */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
212267a45760SJunchao Zhang 
/* Write-only access to the host value array: no device-to-host sync is needed
   since the existing values will be overwritten. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
212867a45760SJunchao Zhang 
/* End write-only access: the host copy now holds the fresh values, so mark it
   authoritative and invalidate the caller's pointer. */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
21357e8381f9SStefano Zampini 
/* Return device pointers to A's CSR arrays (row offsets i, column indices j,
   values a) together with the memory type (CUDA). Any of i/j/a/mtype may be
   NULL if the caller does not want it. Triggers a host-to-device copy first
   so the returned pointers reflect current values. Not usable on factored
   matrices, and index pointers are unavailable with 64-bit PetscInt because
   the device arrays are stored as 32-bit (THRUSTINTARRAY32). */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    /* 32-bit PetscInt matches the int storage of THRUSTINTARRAY32 */
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
21657ee59b9bSJunchao Zhang 
/* Ensure the matrix is present and up to date on the GPU.

   Two paths when the host copy is newer (offloadmask UNALLOCATED or CPU):
   - Same nonzero pattern (nonzerostate unchanged) and CSR format: copy only
     the numerical values into the existing device CSR and invalidate the
     cached transpose values.
   - Otherwise: destroy and rebuild the entire device structure from the host
     CSR (using the compressed-row form when a->compressedrow.use is set).
   On success the offload mask becomes BOTH, unless the host had no value
   array (a->a == NULL), in which case only the structure was uploaded. */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE; /* set to FALSE if only the structure (no values) is copied */

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* values changed, so cached transpose values are stale (structure is kept) */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Tear down the old device structures before rebuilding */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Pick compressed-row (only nonempty rows) or full CSR view of the host matrix */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* no host values yet: copy structure only */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* a temporary CSR is built, converted to HYB, then freed */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
23129ae82921SPaul Mullowney 
/* Thrust functor for zipped tuples t = (src, dst): accumulate dst += src */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) += thrust::get<0>(t);
  }
};
2319aa372e3fSPaul Mullowney 
/* Thrust functor for zipped tuples t = (src, dst): assign dst = src */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
23267e8381f9SStefano Zampini 
/* Thrust functor for zipped tuples t = (dst, src): assign dst = src
   (i.e. the copy direction is reversed relative to VecCUDAEquals) */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2333e6e9a74fSStefano Zampini 
/* Scratch data kept alive across MatProduct symbolic/numeric phases for
   AIJCUSPARSE matrix-matrix products; freed by MatDestroy_MatMatCusparse().
   All pointer/flag members are expected to start zeroed (the destroy routine
   tests them before freeing). */
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably whether the product C is of a dense type — confirm against the product code */
  PetscScalar   *Bt;       /* device buffer (freed with cudaFree) for an explicit transpose of B */
  Mat            X;        /* work matrix, destroyed with MatDestroy */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count for logging */
  CsrMatrix     *Bcsr;     /* CSR copy of B (deleted with C++ delete) */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse descriptor for B */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t matCDescr;   /* dense descriptor for C */
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* extra SpGEMM work buffers introduced in CUDA 11.4 */
  void *dBuffer5;
#endif
  size_t                mmBufferSize; /* size of mmBuffer */
  void                 *mmBuffer;     /* SpMM/SpGEMM work buffer */
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2358ccdfe979SStefano Zampini 
/* Destroy callback for MatMatCusparse product data: releases device buffers,
   cusparse descriptors, the CSR copy of B, and the work matrix, then frees
   the struct itself. Safe on partially initialized data (NULL members are
   skipped; cudaFree/delete are no-ops on NULL). */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data) {
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt));
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
#endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2381ccdfe979SStefano Zampini 
2382ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2383ccdfe979SStefano Zampini 
/* MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of C = op(A)*op(B)
   with A of type MATSEQAIJCUSPARSE and B dense.

   Supports MATPRODUCT_AB, AtB, ABt, PtAP and RARt.  For PtAP/RARt the
   sparse-times-dense result is first written into the intermediate dense matrix
   mmdata->X and then combined with B (dense-dense product) to form C.  If the
   caller supplied a host dense B (or expects a host dense C), conversions to and
   from MATSEQDENSECUDA are performed around the GPU computation. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda;
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* choose which stored matrix (A or its explicit transpose) and which cusparse
     operation realize op(A), and record the m x n dimensions of the product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* no explicit transpose requested: let cusparse transpose A on the fly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write the sparse-dense product into the intermediate X instead of C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* a stale descriptor with the wrong leading dimension must be rebuilt */
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    /* grow (never shrink) the SpMM work buffer to the size cusparse asks for */
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* explicitly transpose B into the preallocated Bt buffer via cublas geam */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  /* for RARt/PtAP finish with a dense-dense product of B (or B^T) against X */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* undo the temporary conversions to GPU dense done above */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2539ccdfe979SStefano Zampini 
/* MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for products of a
   MATSEQAIJCUSPARSE matrix A with a dense matrix B.

   Sets the sizes and (GPU dense) type of C, allocates the MatMatCusparse scratch
   data, and installs the numeric routine.  For PtAP/RARt an intermediate dense
   matrix X is created to hold op(A)*op(B); for CUDA < 11 with ABt/RARt a device
   buffer for B^T is preallocated, since the legacy csrmm cannot transpose B. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nrows, ncols;
  PetscBool           Cwasdense, isseqaijcusp;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *acusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isseqaijcusp));
  PetscCheck(isseqaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* dimensions of the requested product */
  switch (product->type) {
  case MATPRODUCT_AB: /* A * B */
    nrows = A->rmap->n;
    ncols = B->cmap->n;
    break;
  case MATPRODUCT_AtB: /* A^T * B */
    nrows = A->cmap->n;
    ncols = B->cmap->n;
    break;
  case MATPRODUCT_ABt: /* A * B^T */
    nrows = A->rmap->n;
    ncols = B->rmap->n;
    break;
  case MATPRODUCT_PtAP: /* B^T * A * B */
    nrows = B->cmap->n;
    ncols = B->cmap->n;
    break;
  case MATPRODUCT_RARt: /* B * A * B^T */
    nrows = B->rmap->n;
    ncols = B->rmap->n;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, nrows, ncols, nrows, ncols));
  /* if C is of type MATSEQDENSE (CPU), remember that: the numeric phase computes on
     the GPU and converts the result back to the host type at the end */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &Cwasdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data */
  PetscCall(PetscNew(&mm));
  mm->cisdense = Cwasdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      PetscCall(MatSetSizes(mm->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mm->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2608ccdfe979SStefano Zampini 
/* MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse
   product C = op(A)*op(B) with both operands MATSEQAIJCUSPARSE, using cusparse SpGEMM
   (or csrgemm before CUDA 11).

   Since cusparse has no separate numeric-only entry point for sparse matmat, when the
   symbolic phase already computed values (reusesym) the first numeric call just resets
   the flag and finalizes.  Products AtB/ABt with a symmetric operand are remapped to AB,
   matching what the symbolic phase did. */
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute, just finalize the assembly bookkeeping */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* mirror the symmetry-based remapping decided during the symbolic phase */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are realized through the explicitly stored transpose structs,
     since spgemm itself only supports NON_TRANSPOSE operations */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: reuse the sparsity pattern computed during the symbolic phase */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#else
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
#endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2727fcdce8c4SStefano Zampini 
27289371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
2729fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2730fcdce8c4SStefano Zampini   Mat                           A, B;
2731fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2732fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2733fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2734fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2735fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
2736fcdce8c4SStefano Zampini   PetscBool                     flg;
2737fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2738fcdce8c4SStefano Zampini   MatProductType                ptype;
2739fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2740fcdce8c4SStefano Zampini   PetscLogDouble                flops;
2741fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
2742fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2743fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2744fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2745fcdce8c4SStefano Zampini #else
2746fcdce8c4SStefano Zampini   int cnz;
2747fcdce8c4SStefano Zampini #endif
2748b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2749fcdce8c4SStefano Zampini 
2750fcdce8c4SStefano Zampini   PetscFunctionBegin;
2751fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
275228b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2753fcdce8c4SStefano Zampini   A = product->A;
2754fcdce8c4SStefano Zampini   B = product->B;
27559566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
275628b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
27579566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
275828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2759fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
2760fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
2761fcdce8c4SStefano Zampini   /* product data */
27629566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2763fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2764fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2765fcdce8c4SStefano Zampini 
27669566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
27679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2768d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2769d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
277008401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
277108401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2772d60bce21SJunchao Zhang 
2773fcdce8c4SStefano Zampini   ptype = product->type;
2774b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2775fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2776fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2777fa046f9fSJunchao Zhang   }
2778b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2779fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2780fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2781fa046f9fSJunchao Zhang   }
2782fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2783fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2784fcdce8c4SStefano Zampini   switch (ptype) {
2785fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2786fcdce8c4SStefano Zampini     m    = A->rmap->n;
2787fcdce8c4SStefano Zampini     n    = B->cmap->n;
2788fcdce8c4SStefano Zampini     k    = A->cmap->n;
2789fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2790fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2791fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2792fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2793fcdce8c4SStefano Zampini     break;
2794fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2795fcdce8c4SStefano Zampini     m = A->cmap->n;
2796fcdce8c4SStefano Zampini     n = B->cmap->n;
2797fcdce8c4SStefano Zampini     k = A->rmap->n;
27989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2799fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2800fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2801fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2802fcdce8c4SStefano Zampini     break;
2803fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2804fcdce8c4SStefano Zampini     m = A->rmap->n;
2805fcdce8c4SStefano Zampini     n = B->rmap->n;
2806fcdce8c4SStefano Zampini     k = A->cmap->n;
28079566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2808fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2809fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2810fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2811fcdce8c4SStefano Zampini     break;
28129371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2813fcdce8c4SStefano Zampini   }
2814fcdce8c4SStefano Zampini 
2815fcdce8c4SStefano Zampini   /* create cusparse matrix */
28169566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
28179566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2818fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
2819fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2820fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2821fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2822fcdce8c4SStefano Zampini 
2823fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2824fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
2825fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
28269566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
28279566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2828fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2829fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2830fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2831fcdce8c4SStefano Zampini   } else {
2832fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2833fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2834fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2835fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2836fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2837fcdce8c4SStefano Zampini   }
2838fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2839fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
2840fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
2841fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2842fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2843fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
28449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
28459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
28469566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
28479566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
28489566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
28499566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
28509566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28519566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28529566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2853fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raises errors in different calls when matrices have zero rows/columns! */
2854fcdce8c4SStefano Zampini     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2855fcdce8c4SStefano Zampini     c->nz                = 0;
2856fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2857fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
2858fcdce8c4SStefano Zampini     goto finalizesym;
2859fcdce8c4SStefano Zampini   }
2860fcdce8c4SStefano Zampini 
286128b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
286228b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2863fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2864fcdce8c4SStefano Zampini   if (!biscompressed) {
2865fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
2866fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2867fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2868fcdce8c4SStefano Zampini #endif
2869fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2870fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2871fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
2872fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2873fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2874fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2875fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2876fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2877fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2878fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2879fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
28809566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2881fcdce8c4SStefano Zampini     }
2882fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2883fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
2884fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2885fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
28869371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28879371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2888fcdce8c4SStefano Zampini     }
2889fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2890fcdce8c4SStefano Zampini #endif
2891fcdce8c4SStefano Zampini   }
289228b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
289328b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2894fcdce8c4SStefano Zampini   /* precompute flops count */
2895fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2896fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2897fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2898fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
2899fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
2900fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2901fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2902fcdce8c4SStefano Zampini       }
2903fcdce8c4SStefano Zampini     }
2904fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2905fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2906fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
2907fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2908fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
2909fcdce8c4SStefano Zampini     }
2910fcdce8c4SStefano Zampini   } else { /* TODO */
2911fcdce8c4SStefano Zampini     flops = 0.;
2912fcdce8c4SStefano Zampini   }
2913fcdce8c4SStefano Zampini 
2914fcdce8c4SStefano Zampini   mmdata->flops = flops;
29159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2916b4285af6SJunchao Zhang 
2917fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29189566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
29199371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
29209371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29219566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2922b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2923b4285af6SJunchao Zhang   {
2924b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2925b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2926b4285af6SJunchao Zhang   */
2927b4285af6SJunchao Zhang     void  *dBuffer1    = NULL;
2928b4285af6SJunchao Zhang     void  *dBuffer2    = NULL;
2929b4285af6SJunchao Zhang     void  *dBuffer3    = NULL;
2930b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2931b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
2932b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
2933b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
2934b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
2935b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
2936b4285af6SJunchao Zhang 
2937b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2938b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
29399371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
29409371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29419566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2942b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
29439371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
29449371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2945b4285af6SJunchao Zhang 
2946b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29479371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
29489371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29499566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
29509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
29519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
29529371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
29539371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29549566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
29559566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
2956b4285af6SJunchao Zhang 
2957b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2958b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
29599566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2960b4285af6SJunchao Zhang     c->nz                = (PetscInt)C_nnz1;
2961b4285af6SJunchao Zhang     /* allocate matrix C */
29629371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29639371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29649371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
29659371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2966b4285af6SJunchao Zhang     /* update matC with the new pointers */
29679371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29689371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2969b4285af6SJunchao Zhang 
2970b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29719371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
29729371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
29749371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
29759371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29769566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
29779371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29789371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29799566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2980b4285af6SJunchao Zhang   }
2981ae37ee31SJunchao Zhang #else
2982b4285af6SJunchao Zhang   size_t bufSize2;
2983fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
29849371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
29859371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29869566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2987fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
29889371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
29899371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2990fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
29919371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
29929371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2993fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2994fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2995fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2996fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2997fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
29989566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2999fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
30009371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
30019371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3002fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
30039566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3004fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
30059371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
30069371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3007fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30089566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3009fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30109566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
30119371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
30129371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
30139371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30149371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3015ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3016fcdce8c4SStefano Zampini #else
30179566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
30189371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30199371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
30209371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3021fcdce8c4SStefano Zampini   c->nz = cnz;
3022fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30239566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3024fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30259566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3026fcdce8c4SStefano Zampini 
30279566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3028fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3029fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3030fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
30319371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30329371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30339371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3034fcdce8c4SStefano Zampini #endif
30359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3037fcdce8c4SStefano Zampini finalizesym:
3038fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3039fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3040fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
30419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
30429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3043fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3044fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3045fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3046fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3047fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3048fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3049fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3052fcdce8c4SStefano Zampini   } else {
3053fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3054fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30559566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3057fcdce8c4SStefano Zampini   }
3058fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3059fcdce8c4SStefano Zampini     PetscInt r = 0;
3060fcdce8c4SStefano Zampini     c->i[0]    = 0;
3061fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3062fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3063fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3064fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3065fcdce8c4SStefano Zampini     }
3066fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3067fcdce8c4SStefano Zampini   }
30689566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
30699566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
30709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3071fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3072fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3073fcdce8c4SStefano Zampini   c->rmax          = 0;
3074fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3075fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3076fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3077fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3078fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3079fcdce8c4SStefano Zampini   }
30809566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
30819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3082fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3083fcdce8c4SStefano Zampini 
3084fcdce8c4SStefano Zampini   C->nonzerostate++;
30859566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
30869566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3087fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3088fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3089fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3090fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3091fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3092abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3093fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3094fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3095fcdce8c4SStefano Zampini   }
3096fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3097fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3098fcdce8c4SStefano Zampini }
3099fcdce8c4SStefano Zampini 
3100fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3101fcdce8c4SStefano Zampini 
3102fcdce8c4SStefano Zampini /* handles sparse or dense B */
31039371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
3104fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3105fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3106fcdce8c4SStefano Zampini 
3107fcdce8c4SStefano Zampini   PetscFunctionBegin;
3108fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
31099566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
3110*48a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3111fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3112fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
3113*48a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3114fcdce8c4SStefano Zampini   }
311565e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
311665e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
311765e4b4d4SStefano Zampini     switch (product->type) {
311865e4b4d4SStefano Zampini     case MATPRODUCT_AB:
311965e4b4d4SStefano Zampini       if (product->api_user) {
3120d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
31219566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3122d0609cedSBarry Smith         PetscOptionsEnd();
312365e4b4d4SStefano Zampini       } else {
3124d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
31259566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3126d0609cedSBarry Smith         PetscOptionsEnd();
312765e4b4d4SStefano Zampini       }
312865e4b4d4SStefano Zampini       break;
312965e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
313065e4b4d4SStefano Zampini       if (product->api_user) {
3131d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
31329566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3133d0609cedSBarry Smith         PetscOptionsEnd();
313465e4b4d4SStefano Zampini       } else {
3135d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
31369566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3137d0609cedSBarry Smith         PetscOptionsEnd();
313865e4b4d4SStefano Zampini       }
313965e4b4d4SStefano Zampini       break;
314065e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
314165e4b4d4SStefano Zampini       if (product->api_user) {
3142d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
31439566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3144d0609cedSBarry Smith         PetscOptionsEnd();
314565e4b4d4SStefano Zampini       } else {
3146d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
31479566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3148d0609cedSBarry Smith         PetscOptionsEnd();
314965e4b4d4SStefano Zampini       }
315065e4b4d4SStefano Zampini       break;
315165e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
315265e4b4d4SStefano Zampini       if (product->api_user) {
3153d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
31549566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3155d0609cedSBarry Smith         PetscOptionsEnd();
315665e4b4d4SStefano Zampini       } else {
3157d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
31589566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3159d0609cedSBarry Smith         PetscOptionsEnd();
316065e4b4d4SStefano Zampini       }
316165e4b4d4SStefano Zampini       break;
316265e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
316365e4b4d4SStefano Zampini       if (product->api_user) {
3164d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
31659566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3166d0609cedSBarry Smith         PetscOptionsEnd();
316765e4b4d4SStefano Zampini       } else {
3168d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
31699566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3170d0609cedSBarry Smith         PetscOptionsEnd();
317165e4b4d4SStefano Zampini       }
317265e4b4d4SStefano Zampini       break;
31739371c9d4SSatish Balay     default: break;
317465e4b4d4SStefano Zampini     }
317565e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
317665e4b4d4SStefano Zampini   }
317765e4b4d4SStefano Zampini   /* dispatch */
3178fcdce8c4SStefano Zampini   if (isdense) {
3179ccdfe979SStefano Zampini     switch (product->type) {
3180ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3181ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3182ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3183ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3184ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3185fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
31869566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3187fcdce8c4SStefano Zampini       } else {
3188fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3189fcdce8c4SStefano Zampini       }
3190fcdce8c4SStefano Zampini       break;
31919371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
31929371c9d4SSatish Balay     default: break;
3193ccdfe979SStefano Zampini     }
3194fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3195fcdce8c4SStefano Zampini     switch (product->type) {
3196fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3197fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
31989371c9d4SSatish Balay     case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
3199fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3200fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
32019371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
32029371c9d4SSatish Balay     default: break;
3203fcdce8c4SStefano Zampini     }
3204fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
32059566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3206fcdce8c4SStefano Zampini   }
3207ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3208ccdfe979SStefano Zampini }
3209ccdfe979SStefano Zampini 
/* yy = A*xx: thin wrapper that dispatches to the shared SpMV kernel with no add, no transpose, no conjugation */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3215e6e9a74fSStefano Zampini 
/* zz = A*xx + yy: thin wrapper that dispatches to the shared SpMV kernel with an additive vector, no transpose */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3221e6e9a74fSStefano Zampini 
/* yy = A^H*xx: thin wrapper that dispatches to the shared SpMV kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
3227e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy: thin wrapper that dispatches to the shared SpMV kernel with trans = herm = PETSC_TRUE */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
  PetscFunctionReturn(0);
}
32339ae82921SPaul Mullowney 
/* yy = A^T*xx: thin wrapper that dispatches to the shared SpMV kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3239ca45077fSPaul Mullowney 
/* y[idx[i]] += x[i] for 0 <= i < n.
   Used to scatter-add the compressed-row SpMV result (work vector) back into the full-length output vector;
   idx entries are assumed distinct (compressed row indices), so the unguarded += is race-free.
   Grid-stride loop: correct for any launch geometry, including grids smaller than n. */
__global__ static void ScatterAdd(PetscInt n, const PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  /* index arithmetic in PetscInt: with 64-bit PetscInt an `int` index would overflow once n exceeds INT_MAX */
  for (PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (PetscInt)gridDim.x * blockDim.x) y[idx[i]] += x[i];
}
3244a0e72f99SJunchao Zhang 
3245afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* lengths of x and y for y = op(A) x; NOTE(review): only assigned below for the MAT_CUSPARSE_CSR format.
     The ELL/HYB branch errors out under CUDA >= 11, so nx/ny never reach cusparseCreateDnVec() uninitialized there — confirm */
  PetscInt nx, ny;
#endif

  PetscFunctionBegin;
  /* op = ^H only makes sense together with trans */
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: result is just beta*y (i.e. 0 or a copy of yy) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
    else PetscCall(VecCopy_SeqCUDA(yy, zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    /* transpose product: either apply op to the stored matrix, or use an explicitly formed transpose */
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector: workVector[k] = x[cprowIndices[k]] */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* transpose product: the roles of rows and columns are swapped relative to the branch above */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* opA indexes the per-operation cuSpMV cache below, hence the ABI guard */
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand: create dense-vector descriptors and size/allocate the SpMV work buffer once per op */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      /* pre-CUDA-11 path: legacy csrmv-style interface, no descriptors or work buffer needed */
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
#if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
#else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
#endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* transpose case: result is full length, only the optional add of yy remains (zz == yy was folded in via beta) */
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  /* 2 flops per stored nonzero; without the add, rows contribute one flop less each */
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
34079ae82921SPaul Mullowney 
/* zz = A^T*xx + yy: thin wrapper that dispatches to the shared SpMV kernel with trans = PETSC_TRUE, herm = PETSC_FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
  PetscFunctionReturn(0);
}
3413ca45077fSPaul Mullowney 
/* Finish assembly on the host, then drop the cached device matrix if the nonzero pattern changed */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_SeqAIJCUSPARSE   *cusp         = (Mat_SeqAIJCUSPARSE *)A->spptr;
  const PetscObjectState state_before = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* a new nonzero state invalidates any previously built device-side representation */
  if (cusp->deviceMat && state_before != A->nonzerostate) {
    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCallCUDA(cudaFree(cusp->deviceMat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
34279ae82921SPaul Mullowney 
34289ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3429e057df02SPaul Mullowney /*@
34309ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
3432e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3433e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3434e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3435e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
34369ae82921SPaul Mullowney 
3437d083f849SBarry Smith    Collective
34389ae82921SPaul Mullowney 
34399ae82921SPaul Mullowney    Input Parameters:
34409ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
34419ae82921SPaul Mullowney .  m - number of rows
34429ae82921SPaul Mullowney .  n - number of columns
34439ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
34449ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
34450298fd71SBarry Smith          (possibly different for each row) or NULL
34469ae82921SPaul Mullowney 
34479ae82921SPaul Mullowney    Output Parameter:
34489ae82921SPaul Mullowney .  A - the matrix
34499ae82921SPaul Mullowney 
34509ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
34529ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
34539ae82921SPaul Mullowney 
34549ae82921SPaul Mullowney    Notes:
34559ae82921SPaul Mullowney    If nnz is given then nz is ignored
34569ae82921SPaul Mullowney 
34579ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
34589ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
34599ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
34609ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
34619ae82921SPaul Mullowney 
34629ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
34630298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
34649ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
34659ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
34669ae82921SPaul Mullowney 
34679ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
34689ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
34699ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
34709ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
34719ae82921SPaul Mullowney 
34729ae82921SPaul Mullowney    Level: intermediate
34739ae82921SPaul Mullowney 
3474db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
34759ae82921SPaul Mullowney @*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  PetscFunctionBegin;
  /* standard create/size/type sequence; the matrix is sequential, so global sizes equal local sizes */
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n));
  PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
  /* cast drops const to match the preallocation routine's non-const parameter */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}
34849ae82921SPaul Mullowney 
/* Free the CUSPARSE-specific data attached to A, detach the composed type-specific methods,
   then destroy the underlying host SeqAIJ matrix */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    /* plain matrix: release the GPU matrix structures */
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    /* factored matrix: release the triangular-factor structures instead */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* composing NULL removes each previously composed function from the object */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
35059ae82921SPaul Mullowney 
3506ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
350795639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  /* duplicate as a host SeqAIJ matrix first, then convert the duplicate in place to the CUSPARSE type */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
35149ff858a8SKarl Rupp 
35159371c9d4SSatish Balay static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
3516a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3517039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3518039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3519039c6fbaSStefano Zampini   PetscScalar        *ay;
3520039c6fbaSStefano Zampini   const PetscScalar  *ax;
3521039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3522e6e9a74fSStefano Zampini 
352395639643SRichard Tran Mills   PetscFunctionBegin;
3524a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3525a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3526039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
35279566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
35289566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3529a587d139SMark     PetscFunctionReturn(0);
353095639643SRichard Tran Mills   }
3531039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
35329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
35339566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
35345f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
35355f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3536039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3537039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3538039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3539039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3540039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
35419371c9d4SSatish Balay     if (eq) { eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); }
3542039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3543039c6fbaSStefano Zampini   }
3544d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3545d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3546039c6fbaSStefano Zampini 
3547039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3548039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3549039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3550039c6fbaSStefano Zampini     size_t bufferSize;
3551039c6fbaSStefano Zampini     void  *buffer;
3552039c6fbaSStefano Zampini #endif
3553039c6fbaSStefano Zampini 
35549566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
35559566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
35569566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3557039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
35589371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35599371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
35609566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
35619566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35629371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35639371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
35649566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
35659566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
35669566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3567039c6fbaSStefano Zampini #else
35689566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35699371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35709371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
35719566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
35729566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3573039c6fbaSStefano Zampini #endif
35749566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
35759566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
35769566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
35779566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3578039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3579a587d139SMark     cublasHandle_t cublasv2handle;
3580a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3581039c6fbaSStefano Zampini 
35829566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
35839566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
35849566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
35859566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
35869566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35879566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
35889566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
35899566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
35909566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
35919566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
35929566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3593039c6fbaSStefano Zampini   } else {
35949566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
35959566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3596a587d139SMark   }
359795639643SRichard Tran Mills   PetscFunctionReturn(0);
359895639643SRichard Tran Mills }
359995639643SRichard Tran Mills 
36009371c9d4SSatish Balay static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
360133c9ba73SStefano Zampini   Mat_SeqAIJ    *y = (Mat_SeqAIJ *)Y->data;
360233c9ba73SStefano Zampini   PetscScalar   *ay;
360333c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
360433c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
360533c9ba73SStefano Zampini 
360633c9ba73SStefano Zampini   PetscFunctionBegin;
36079566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
36089566063dSJacob Faibussowitsch   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
36099566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(y->nz, &bnz));
36109566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
36119566063dSJacob Faibussowitsch   PetscCallCUBLAS(cublasXscal(cublasv2handle, bnz, &a, ay, one));
36129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(bnz));
36139566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
36149566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
36159566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
361633c9ba73SStefano Zampini   PetscFunctionReturn(0);
361733c9ba73SStefano Zampini }
361833c9ba73SStefano Zampini 
36199371c9d4SSatish Balay static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
36207e8381f9SStefano Zampini   PetscBool   both = PETSC_FALSE;
3621a587d139SMark   Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;
36227e8381f9SStefano Zampini 
36233fa6b06aSMark Adams   PetscFunctionBegin;
36243fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
36253fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
36267e8381f9SStefano Zampini     if (spptr->mat) {
36277e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
36287e8381f9SStefano Zampini       if (matrix->values) {
36297e8381f9SStefano Zampini         both = PETSC_TRUE;
36307e8381f9SStefano Zampini         thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
36317e8381f9SStefano Zampini       }
36327e8381f9SStefano Zampini     }
36337e8381f9SStefano Zampini     if (spptr->matTranspose) {
36347e8381f9SStefano Zampini       CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
36359371c9d4SSatish Balay       if (matrix->values) { thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.); }
36367e8381f9SStefano Zampini     }
36373fa6b06aSMark Adams   }
36389566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
36399566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
36407e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3641a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
36423fa6b06aSMark Adams   PetscFunctionReturn(0);
36433fa6b06aSMark Adams }
36443fa6b06aSMark Adams 
36459371c9d4SSatish Balay static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
3646a587d139SMark   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3647a587d139SMark 
3648a587d139SMark   PetscFunctionBegin;
36499a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
36509a14fc28SStefano Zampini     A->boundtocpu = flg;
36519a14fc28SStefano Zampini     PetscFunctionReturn(0);
36529a14fc28SStefano Zampini   }
3653a587d139SMark   if (flg) {
36549566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3655a587d139SMark 
365633c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3657a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3658a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3659a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3660a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3661a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3662a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3663a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3664a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3665fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
36669566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
36679566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
36689566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
36699566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
36709566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
36719566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
36729566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
3673a587d139SMark   } else {
367433c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3675a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3676a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3677a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3678a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3679a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3680a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3681a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3682a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3683fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
368467a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
368567a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
368667a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
368767a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
368867a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
368967a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
36907ee59b9bSJunchao Zhang     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
36917ee59b9bSJunchao Zhang 
36929566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
36939566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
36949566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
36959566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
36969566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
36979566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
3698a587d139SMark   }
3699a587d139SMark   A->boundtocpu = flg;
3700ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
3701ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
3702ea500dcfSRichard Tran Mills   } else {
3703ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
3704ea500dcfSRichard Tran Mills   }
3705a587d139SMark   PetscFunctionReturn(0);
3706a587d139SMark }
3707a587d139SMark 
37089371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
370949735bf3SStefano Zampini   Mat B;
37109ae82921SPaul Mullowney 
37119ae82921SPaul Mullowney   PetscFunctionBegin;
37129566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
371349735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
37149566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
371549735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
37169566063dSJacob Faibussowitsch     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
371749735bf3SStefano Zampini   }
371849735bf3SStefano Zampini   B = *newmat;
371949735bf3SStefano Zampini 
37209566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
37219566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));
372234136279SStefano Zampini 
372349735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
37249ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3725e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
37269566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
37279566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
37289566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
37291a2c6b5cSJunchao Zhang       spptr->format = MAT_CUSPARSE_CSR;
3730d8132acaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3731ba986b86SSatish Balay #if CUSPARSE_VERSION > 11301
3732a435da06SStefano Zampini       spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3733a435da06SStefano Zampini #else
3734d8132acaSStefano Zampini       spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
3735a435da06SStefano Zampini #endif
3736d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3737d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3738d8132acaSStefano Zampini #endif
37391a2c6b5cSJunchao Zhang       B->spptr = spptr;
37409ae82921SPaul Mullowney     } else {
3741e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3742e6e9a74fSStefano Zampini 
37439566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
37449566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
37459566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
3746e6e9a74fSStefano Zampini       B->spptr = spptr;
37479ae82921SPaul Mullowney     }
3748e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
374949735bf3SStefano Zampini   }
3750693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
37519ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
37521a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
37539ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
375495639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3755693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
37562205254eSKarl Rupp 
37579566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE));
37589566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
37599566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
3760ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
37619566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
3762ae48a8d0SStefano Zampini #endif
37639566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
37649ae82921SPaul Mullowney   PetscFunctionReturn(0);
37659ae82921SPaul Mullowney }
37669ae82921SPaul Mullowney 
37679371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
376802fe1965SBarry Smith   PetscFunctionBegin;
37699566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
37709566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
377102fe1965SBarry Smith   PetscFunctionReturn(0);
377202fe1965SBarry Smith }
377302fe1965SBarry Smith 
37743ca39a21SBarry Smith /*MC
3775e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3776e057df02SPaul Mullowney 
3777e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
37782692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
37792692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3780e057df02SPaul Mullowney 
3781e057df02SPaul Mullowney    Options Database Keys:
3782e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3783aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3784a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3785365b711fSMark Adams +  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3786e057df02SPaul Mullowney 
3787e057df02SPaul Mullowney   Level: beginner
3788e057df02SPaul Mullowney 
3789db781477SPatrick Sanan .seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3790e057df02SPaul Mullowney M*/
37917f756511SDominic Meiser 
3792bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
37930f39cd5aSBarry Smith 
37949371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
379542c9c57cSBarry Smith   PetscFunctionBegin;
37969566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
37979566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
37989566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
37999566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
38009566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));
3801bddcd29dSMark Adams 
380242c9c57cSBarry Smith   PetscFunctionReturn(0);
380342c9c57cSBarry Smith }
380429b38603SBarry Smith 
38059371c9d4SSatish Balay static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
3806cbc6b225SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;
3807cbc6b225SStefano Zampini 
3808cbc6b225SStefano Zampini   PetscFunctionBegin;
3809cbc6b225SStefano Zampini   if (!cusp) PetscFunctionReturn(0);
3810cbc6b225SStefano Zampini   delete cusp->cooPerm;
3811cbc6b225SStefano Zampini   delete cusp->cooPerm_a;
3812cbc6b225SStefano Zampini   cusp->cooPerm   = NULL;
3813cbc6b225SStefano Zampini   cusp->cooPerm_a = NULL;
3814cbc6b225SStefano Zampini   if (cusp->use_extended_coo) {
38159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->jmap_d));
38169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->perm_d));
3817cbc6b225SStefano Zampini   }
3818cbc6b225SStefano Zampini   cusp->use_extended_coo = PETSC_FALSE;
3819cbc6b225SStefano Zampini   PetscFunctionReturn(0);
3820cbc6b225SStefano Zampini }
3821cbc6b225SStefano Zampini 
38229371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) {
38237f756511SDominic Meiser   PetscFunctionBegin;
38247f756511SDominic Meiser   if (*cusparsestruct) {
38259566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
38269566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
38277f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
382881902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
38297e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
38307e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
3831a49f1ed0SStefano Zampini     delete (*cusparsestruct)->csr2csc_i;
38329566063dSJacob Faibussowitsch     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
38339566063dSJacob Faibussowitsch     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
38349566063dSJacob Faibussowitsch     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
38359566063dSJacob Faibussowitsch     PetscCall(PetscFree(*cusparsestruct));
38367f756511SDominic Meiser   }
38377f756511SDominic Meiser   PetscFunctionReturn(0);
38387f756511SDominic Meiser }
38397f756511SDominic Meiser 
38409371c9d4SSatish Balay static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) {
38417f756511SDominic Meiser   PetscFunctionBegin;
38427f756511SDominic Meiser   if (*mat) {
38437f756511SDominic Meiser     delete (*mat)->values;
38447f756511SDominic Meiser     delete (*mat)->column_indices;
38457f756511SDominic Meiser     delete (*mat)->row_offsets;
38467f756511SDominic Meiser     delete *mat;
38477f756511SDominic Meiser     *mat = 0;
38487f756511SDominic Meiser   }
38497f756511SDominic Meiser   PetscFunctionReturn(0);
38507f756511SDominic Meiser }
38517f756511SDominic Meiser 
38529371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) {
38537f756511SDominic Meiser   PetscFunctionBegin;
38547f756511SDominic Meiser   if (*trifactor) {
38559566063dSJacob Faibussowitsch     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
3856261a78b4SJunchao Zhang     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
38579566063dSJacob Faibussowitsch     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
38589566063dSJacob Faibussowitsch     if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
38599566063dSJacob Faibussowitsch     if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
3860afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
38619566063dSJacob Faibussowitsch     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
3862afb2bd1cSJunchao Zhang #endif
38639566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactor));
38647f756511SDominic Meiser   }
38657f756511SDominic Meiser   PetscFunctionReturn(0);
38667f756511SDominic Meiser }
38677f756511SDominic Meiser 
38689371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) {
38697f756511SDominic Meiser   CsrMatrix *mat;
38707f756511SDominic Meiser 
38717f756511SDominic Meiser   PetscFunctionBegin;
38727f756511SDominic Meiser   if (*matstruct) {
38737f756511SDominic Meiser     if ((*matstruct)->mat) {
38747f756511SDominic Meiser       if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
3875afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3876afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3877afb2bd1cSJunchao Zhang #else
38787f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
38799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
3880afb2bd1cSJunchao Zhang #endif
38817f756511SDominic Meiser       } else {
38827f756511SDominic Meiser         mat = (CsrMatrix *)(*matstruct)->mat;
38837f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
38847f756511SDominic Meiser       }
38857f756511SDominic Meiser     }
38869566063dSJacob Faibussowitsch     if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
38877f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
38889566063dSJacob Faibussowitsch     if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
38899566063dSJacob Faibussowitsch     if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
38909566063dSJacob Faibussowitsch     if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));
3891afb2bd1cSJunchao Zhang 
3892afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3893afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
38949566063dSJacob Faibussowitsch     if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
3895afb2bd1cSJunchao Zhang     for (int i = 0; i < 3; i++) {
3896afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
38979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
38989566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
38999566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
3900afb2bd1cSJunchao Zhang       }
3901afb2bd1cSJunchao Zhang     }
3902afb2bd1cSJunchao Zhang #endif
39037f756511SDominic Meiser     delete *matstruct;
39047e8381f9SStefano Zampini     *matstruct = NULL;
39057f756511SDominic Meiser   }
39067f756511SDominic Meiser   PetscFunctionReturn(0);
39077f756511SDominic Meiser }
39087f756511SDominic Meiser 
39099371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) {
3910da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
3911da112707SJunchao Zhang 
39127f756511SDominic Meiser   PetscFunctionBegin;
3913da112707SJunchao Zhang   if (fs) {
3914da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
3915da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
3916da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
3917da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
3918da112707SJunchao Zhang     delete fs->rpermIndices;
3919da112707SJunchao Zhang     delete fs->cpermIndices;
3920da112707SJunchao Zhang     delete fs->workVector;
3921da112707SJunchao Zhang     fs->rpermIndices = NULL;
3922da112707SJunchao Zhang     fs->cpermIndices = NULL;
3923da112707SJunchao Zhang     fs->workVector   = NULL;
3924da112707SJunchao Zhang     if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
3925da112707SJunchao Zhang     if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
3926da112707SJunchao Zhang     fs->init_dev_prop = PETSC_FALSE;
3927da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
3928da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrRowPtr));
3929da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrColIdx));
3930da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->csrVal));
3931da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->X));
3932da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->Y));
393312ba2bc6SJunchao Zhang     // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
3934da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
3935da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
393612ba2bc6SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
3937da112707SJunchao Zhang     PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
3938da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
3939da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
3940da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
3941da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
3942da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
3943da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
3944da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
3945da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
3946da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
3947da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
3948da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
394912ba2bc6SJunchao Zhang 
395012ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr    = PETSC_FALSE;
395112ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
3952da112707SJunchao Zhang #endif
3953ccdfe979SStefano Zampini   }
3954ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3955ccdfe979SStefano Zampini }
3956ccdfe979SStefano Zampini 
/* Destroy a Mat_SeqAIJCUSPARSETriFactors container: reset its factor data, tear down
   the cusparse handle it owns (if any), and free the struct itself. Safe to call when
   *trifactors is already NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    cusparseHandle_t handle = (*trifactors)->handle; /* read after Reset, matching the original destruction order */
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
39687e8381f9SStefano Zampini 
/* Strict-weak ordering on (row, col) tuples: order by row first, then by column. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    if (r1 != r2) return r1 < r2;   /* rows differ: row order decides */
    return t1.get<1>() < t2.get<1>(); /* same row: column order decides */
  }
};
39767e8381f9SStefano Zampini 
/* Equality predicate on (row, col) tuples: true iff both coordinates match. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
39837e8381f9SStefano Zampini 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal. */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
39877e8381f9SStefano Zampini 
/* Binary op: logical OR of two flags, expressed as a 0/1 PetscInt. */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
39917e8381f9SStefano Zampini 
39927e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   Scatter/reduce the COO values v[] (host or device memory) into the device CSR value
   array, using the permutation cusp->cooPerm (and, when duplicates exist, the repeat-map
   cusp->cooPerm_a) built at preallocation time. imode selects INSERT_VALUES (overwrite)
   or ADD_VALUES (accumulate). v == NULL with INSERT_VALUES zeros the matrix. On return
   the matrix is marked assembled with up-to-date data on the GPU. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* temporary device copy of v[] when v[] is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO permutation set up: nothing to insert, just (re)assemble */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) { /* NULL values: INSERT zeroes the matrix, ADD is a no-op; finalize assembly state either way */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* stage host v[] on the device before the thrust kernels below */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up before accumulating */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      /* values[i] += cooPerm_w[i], i.e. accumulate the per-nonzero sums into the CSR values */
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce directly into the CSR values */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else { /* unique entries: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* deleting NULL is a no-op when v[] was already device memory */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
40697e8381f9SStefano Zampini 
/* Mark the cached transpose of A as stale so it is rebuilt before next use; with
   destroy == PETSC_TRUE also free the cached transpose structure and the csr2csc
   index map used to rebuild it. A no-op when A has no CUSPARSE data attached. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4084a49f1ed0SStefano Zampini 
40857e8381f9SStefano Zampini #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices.

   Build the CSR nonzero pattern of A from n COO index pairs (coo_i[], coo_j[]), which may
   live on host or device memory. Side effects stored in the Mat_SeqAIJCUSPARSE struct:
     - cusp->cooPerm:   permutation mapping the sorted nonzero order back to the caller's
                        original COO order (used later by MatSetValuesCOO);
     - cusp->cooPerm_a: for each COO entry, the index of the unique nonzero it maps to
                        (only kept when duplicates exist; NULL otherwise).
   The host-side CSR arrays (a->i, a->j) are rebuilt, values are zeroed, and the result
   is copied to the GPU. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* previous COO setup (if any) had a different size; discard it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* Get device-resident copies of coo_i[]/coo_j[]; only allocate when the input is on host */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] below */

    if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* Rebuild the host-side CSR structure (a->i, a->j) from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4220ed502f03SStefano Zampini 
/* Set up COO assembly for a MATSEQAIJCUSPARSE matrix.

   Two paths: the 'basic' path is taken when the index arrays already live on the device,
   or when a host-side scan finds no negative indices; otherwise the extended path is
   used, which supports negative (ignored) indices and mirrors the resulting jmap/perm
   arrays to the GPU for MatSetValuesCOO_SeqAIJCUSPARSE(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  PetscBool    coo_basic = PETSC_TRUE;
  PetscMemType mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  /* discard any previous COO setup, host-side and device-side */
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) { /* negative indices can only be scanned for on host arrays */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* mirror the host-built jmap[] (repeat offsets) and perm[] (permutation) to the device */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4258219fbbafSJunchao Zhang 
/* Kernel: for each CSR nonzero i, gather its COO contributions kv[perm[jmap[i]..jmap[i+1])]
   and either overwrite (INSERT_VALUES) or accumulate into a[i]. Uses a 1D grid-stride
   loop, so it is correct for any launch configuration. All pointers are device memory. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  const PetscCount stride = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    if (imode == INSERT_VALUES) a[i] = sum;
    else a[i] += sum;
  }
}
4268219fbbafSJunchao Zhang 
/* Insert/add the COO values v[] (host or device memory) into a MATSEQAIJCUSPARSE matrix
   previously prepared with MatSetPreallocationCOO_SeqAIJCUSPARSE(). Dispatches to the
   'basic' implementation unless the extended-COO device maps were set up. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  } else {
    const PetscCount   Annz = seq->nz;
    const PetscScalar *v1   = v;
    PetscScalar       *Aa;
    PetscMemType       memtype;

    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* v[] lives on the host; stage a device copy for the kernel */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT overwrites every nonzero, so write-only access suffices; ADD must read too */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* surface launch-configuration errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  }
  PetscFunctionReturn(0);
}
4302219fbbafSJunchao Zhang 
43035b7e41feSStefano Zampini /*@C
43045b7e41feSStefano Zampini     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
43055b7e41feSStefano Zampini 
43065b7e41feSStefano Zampini    Not collective
43075b7e41feSStefano Zampini 
43085b7e41feSStefano Zampini     Input Parameters:
43095b7e41feSStefano Zampini +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should always be returned in compressed form
43115b7e41feSStefano Zampini 
43125b7e41feSStefano Zampini     Output Parameters:
43135b7e41feSStefano Zampini +   ia - the CSR row pointers
43145b7e41feSStefano Zampini -   ja - the CSR column indices
43155b7e41feSStefano Zampini 
43165b7e41feSStefano Zampini     Level: developer
43175b7e41feSStefano Zampini 
43185b7e41feSStefano Zampini     Notes:
43195b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
43205b7e41feSStefano Zampini 
4321db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
43225b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* both output pointers are required; otherwise do nothing */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (!compressed && aij->compressedrow.use) {
    /* caller wants full row offsets but the device CSR is compressed:
       build (and cache) the uncompressed row offsets on the device on first use */
    if (!cusp->rowoffsets_gpu) {
      cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusp->rowoffsets_gpu->assign(aij->i, aij->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    *i = cusp->rowoffsets_gpu->data().get();
  } else {
    *i = csr->row_offsets->data().get();
  }
  *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
43495f101d05SStefano Zampini 
43505b7e41feSStefano Zampini /*@C
43515b7e41feSStefano Zampini     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
43525b7e41feSStefano Zampini 
43535b7e41feSStefano Zampini    Not collective
43545b7e41feSStefano Zampini 
43555b7e41feSStefano Zampini     Input Parameters:
43565b7e41feSStefano Zampini +   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should always be returned in compressed form
43585b7e41feSStefano Zampini 
43595b7e41feSStefano Zampini     Output Parameters:
43605b7e41feSStefano Zampini +   ia - the CSR row pointers
43615b7e41feSStefano Zampini -   ja - the CSR column indices
43625b7e41feSStefano Zampini 
43635b7e41feSStefano Zampini     Level: developer
43645b7e41feSStefano Zampini 
4365db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()`
43665b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the pointers were only borrowed from the device structures; just null the caller's copies */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
43755f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* ensure the device copy is current; may copy values from the host */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  /* read-only access: offload mask and transpose state are left untouched */
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4409ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* access was read-only, so nothing needs invalidating; just drop the caller's reference */
  *a = NULL;
  PetscFunctionReturn(0);
}
4433ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* ensure the device copy is current before handing out a writable pointer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify device values: mark the GPU copy as the valid one
     and discard any cached transpose (its values would go stale) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been changed through the writable pointer: drop the cached
     diagonal and bump the object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4494039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw values array */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: no MatSeqAIJCUSPARSECopyToGPU() here, existing values are not needed */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller will overwrite device values: mark the GPU copy as the valid one
     and discard any cached transpose (its values would go stale) */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4529ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written on the device: drop the cached diagonal and bump
     the object state so dependents recompute */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4555ed502f03SStefano Zampini 
/* Strict-weak ordering on (row, col, value, perm) tuples: lexicographic by (row, col).
   Used to merge two COO-ordered nonzero streams in MatSeqAIJCUSPARSEMergeMats() */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();
    return r1 < r2 || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
4563ed502f03SStefano Zampini 
/* Functor adding a fixed offset to an integer index; used to shift column
   indices and row offsets when concatenating two matrices */
struct Shift {
  int delta; /* amount added to every input */

  Shift(int shift) : delta(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return delta + c; }
};
4570ed502f03SStefano Zampini 
4571ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
45729371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
4573ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4574ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4575ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4576ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4577ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4578ed502f03SStefano Zampini   cusparseStatus_t              stat;
4579ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4580ed502f03SStefano Zampini 
4581ed502f03SStefano Zampini   PetscFunctionBegin;
4582ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4583ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4584ed502f03SStefano Zampini   PetscValidPointer(C, 4);
4585ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4586ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
45875f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
458808401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4589aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4590aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4591ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4592ed502f03SStefano Zampini     m = A->rmap->n;
4593ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
45949566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
45959566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
45969566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4597ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4598ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4599ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4600ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4601ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4602ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4603ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4604ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4605ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4606ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4607ed502f03SStefano Zampini     Ccusp->nrows            = m;
4608ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4609ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4610ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4611ed502f03SStefano Zampini     Ccsr->num_cols          = n;
46129566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
46139566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
46149566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
46159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
46169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
46179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
46189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46199566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46209566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46219566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46229566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
462328b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
462428b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4625ed502f03SStefano Zampini 
4626ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4627ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4628ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4629ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4630ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4631ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4632ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4633ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4634ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
4635ed502f03SStefano Zampini     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4636ed502f03SStefano Zampini     if (c->nz) {
46372ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
46382ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
46392ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
46402ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
46412ed87e7eSStefano Zampini 
4642ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4643ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4644ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4645ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
46469566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4647ed502f03SStefano Zampini         }
46482ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
46492ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4650ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4651ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4652ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4653ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
46549566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4655ed502f03SStefano Zampini         }
46562ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
46572ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
46589566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
46599371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46609371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46619371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46629371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46632ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
46642ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
46652ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
46668909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4667ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4668ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
46698909a122SStefano Zampini #else
46708909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
46718909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
46728909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
46738909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
46748909a122SStefano Zampini #endif
46752ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
46762ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
46772ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
46782ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
46792ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
46802ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4681ed502f03SStefano Zampini       auto p1    = Ccusp->cooPerm->begin();
4682ed502f03SStefano Zampini       auto p2    = Ccusp->cooPerm->begin();
4683ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4684792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
46858909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
46868909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
46878909a122SStefano Zampini #endif
46882ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
46892ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
46902ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4691792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
46922ed87e7eSStefano Zampini #else
46932ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4694792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4695792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
46962ed87e7eSStefano Zampini #endif
46979371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46989371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46999566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
47002ed87e7eSStefano Zampini       delete wPerm;
47012ed87e7eSStefano Zampini       delete Acoo;
47022ed87e7eSStefano Zampini       delete Bcoo;
47032ed87e7eSStefano Zampini       delete Ccoo;
4704ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47059371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47069371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4707ed502f03SStefano Zampini #endif
47081a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
47099566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
47109566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4711ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4712ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4713ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4714ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4715ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4716ed502f03SStefano Zampini 
47171a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
47181a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4719a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4720ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4721ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4722ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4723ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4724ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4725ed502f03SStefano Zampini 
4726ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4727ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4728ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4729ed502f03SStefano Zampini 
47309566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4731ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4732ed502f03SStefano Zampini         if (AT) {
4733ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4734ed502f03SStefano Zampini           thrust::advance(rT, -1);
4735ed502f03SStefano Zampini         }
4736ed502f03SStefano Zampini         if (BT) {
4737ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4738ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4739ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4740ed502f03SStefano Zampini         }
4741ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4742ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4743ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4744ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4745ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4746ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
47479566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4748ed502f03SStefano Zampini 
47499566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
47509566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
47519566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
47529566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
47539566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
47549566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
47559566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47569566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47579566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4758ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47599371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47609371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4761ed502f03SStefano Zampini #endif
4762ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4763ed502f03SStefano Zampini       }
4764ed502f03SStefano Zampini     }
4765ed502f03SStefano Zampini 
4766ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4767ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4768ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
47699566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
47709566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
4771ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4772ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4773ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4774ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4775ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
47769566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47779566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4778ed502f03SStefano Zampini     } else {
47799566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47809566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4781ed502f03SStefano Zampini     }
47829566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
47839566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
47849566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4785ed502f03SStefano Zampini     c->maxnz         = c->nz;
4786ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4787ed502f03SStefano Zampini     c->rmax          = 0;
4788ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4789ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4790ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4791ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4792ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4793ed502f03SStefano Zampini     }
47949566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
47959566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4796ed502f03SStefano Zampini     (*C)->nonzerostate++;
47979566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
47989566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4799ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4800ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4801ed502f03SStefano Zampini   } else {
480208401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4803ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4804ed502f03SStefano Zampini     if (c->nz) {
4805ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
48065f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4807aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
480808401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
48099566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
48109566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
48115f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
48125f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4813ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4814ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4815ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4816aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4817aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4818aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4819aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
48205f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4821ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4822ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
48239566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48249371c9d4SSatish Balay       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
48259371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4826ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
48279371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
48289371c9d4SSatish Balay       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4829ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
48309566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
48311a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
48325f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4833ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4834ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4835ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4836ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4837ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4838ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4839ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48401a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4841ed502f03SStefano Zampini       }
48429566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4843ed502f03SStefano Zampini     }
4844ed502f03SStefano Zampini   }
48459566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4846ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4847ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4848ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4849ed502f03SStefano Zampini   PetscFunctionReturn(0);
4850ed502f03SStefano Zampini }
4851c215019aSStefano Zampini 
48529371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
4853c215019aSStefano Zampini   bool               dmem;
4854c215019aSStefano Zampini   const PetscScalar *av;
4855c215019aSStefano Zampini 
4856c215019aSStefano Zampini   PetscFunctionBegin;
4857c215019aSStefano Zampini   dmem = isCudaMem(v);
48589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4859c215019aSStefano Zampini   if (n && idx) {
4860c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4861c215019aSStefano Zampini     widx.assign(idx, idx + n);
48629566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4863c215019aSStefano Zampini 
4864c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
4865c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
4866c215019aSStefano Zampini     if (dmem) {
4867c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
4868c215019aSStefano Zampini     } else {
4869c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
4870c215019aSStefano Zampini       dv = w->data();
4871c215019aSStefano Zampini     }
4872c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4873c215019aSStefano Zampini 
4874c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4875c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4876c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
4877*48a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4878c215019aSStefano Zampini     delete w;
4879c215019aSStefano Zampini   } else {
48809566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4881c215019aSStefano Zampini   }
48829566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
48839566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4884c215019aSStefano Zampini   PetscFunctionReturn(0);
4885c215019aSStefano Zampini }
4886