xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 11a5261e40035b7c793f2783a2ba6c7cd4f3b077)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Names used by PetscOptionsEnum() for MatCUSPARSEStorageFormat: the value names, followed by
   the enum type name, the enum value prefix, and a terminating null entry. */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  NULL,
};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/*
  The enums below are copied from cusparse.h in CUDA-11.0. The name tables that follow list the
  values in 0-based integer value order (not declaration order), because PetscOptionsEnum() is
  used to parse the corresponding command line options and derives the enum value from the
  position of the name in the table.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/

/* Value names for cusparseSpMVAlg_t, positions 0..3 */
const char *const MatCUSPARSESpMVAlgorithms[] = {
  "MV_ALG_DEFAULT",
  "COOMV_ALG",
  "CSRMV_ALG1",
  "CSRMV_ALG2",
  "cusparseSpMVAlg_t",
  "CUSPARSE_",
  NULL,
};

/* Value names for cusparseSpMMAlg_t, positions 0..6; note CSR_ALG1 (4) precedes COO_ALG4 (5) */
const char *const MatCUSPARSESpMMAlgorithms[] = {
  "ALG_DEFAULT",
  "COO_ALG1",
  "COO_ALG2",
  "COO_ALG3",
  "CSR_ALG1",
  "COO_ALG4",
  "CSR_ALG2",
  "cusparseSpMMAlg_t",
  "CUSPARSE_SPMM_",
  NULL,
};

/* Value names for cusparseCsr2CscAlg_t; cuSPARSE has no enum value 0, so a placeholder name
   occupies position 0 to keep ALG1 and ALG2 at positions 1 and 2 */
const char *const MatCUSPARSECsr2CscAlgorithms[] = {
  "INVALID",
  "ALG1",
  "ALG2",
  "cusparseCsr2CscAlg_t",
  "CUSPARSE_CSR2CSC_",
  NULL,
};
#endif
589ae82921SPaul Mullowney 
/* ---- Forward declarations of file-local routines ---- */

/* Cholesky / ICC factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

/* LU / ILU factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);

/* Triangular solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);

/* Options processing and basic matrix operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);

/* Matrix-vector products */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destruction of the GPU-side data structures; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded
   on the kind of structure being destroyed */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);

/* Host/device data movement and cached-transpose handling */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* Sub-array copy and COO assembly */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
94c215019aSStefano Zampini 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Records the requested GPU storage format in the
  Mat_SeqAIJCUSPARSE structure attached to A.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
. op     - the operation the format applies to; MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are
           accepted and are handled identically, any other value raises PETSC_ERR_SUP
- format - the storage format to record
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations store the same single format field */
  case MAT_CUSPARSE_ALL:
    ((Mat_SeqAIJCUSPARSE *)A->spptr)->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
1069ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQAIJCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
        `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`)

   Level: intermediate

   Note:
   The request is forwarded to the implementation registered on A under the name "MatCUSPARSESetFormat_C".

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* type-specific dispatch by composed-function name */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
131e057df02SPaul Mullowney 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Stores the use_cpu flag in the use_cpu_solve field of
  the Mat_SeqAIJCUSPARSE structure attached to A (A->spptr).
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
139365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether to use the CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Note:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or on the GPU (GPU is the default).

   Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) {
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* type-specific dispatch by composed-function name */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
164365b711fSMark Adams 
/*
  MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE locally and passes every other
  option on to MatSetOption_SeqAIJ().

  Input Parameters:
+ A   - the matrix
. op  - the option being set
- flg - the new value of the option
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) {
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* When the option is being switched off, destroy any existing transpose matrix first so that
       re-enabling the option later cannot pick up a stale transpose */
    const PetscBool turning_off = (PetscBool)(A->form_explicit_transpose && !flg);

    if (turning_off) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
177e6e9a74fSStefano Zampini 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization: the factors are computed on the
  CPU by MatLUFactorNumeric_SeqAIJ() and, unless a CPU solve was requested, the solve routines
  of B are switched to the cuSPARSE versions and the triangular factors are analyzed and copied
  to the GPU.

  Input Parameters:
+ B    - the factored matrix (output of the symbolic factorization); receives the numeric factors
. A    - the matrix being factored; its values are first brought back from the GPU
- info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

  Notes:
  The use_cpu_solve flag is read from A, not from B. B is a factored matrix and the tri-factor
  routines in this file interpret a factored matrix's spptr as Mat_SeqAIJCUSPARSETriFactors,
  while the flag is a field of Mat_SeqAIJCUSPARSE and is only set (by
  MatSetFromOptions_SeqAIJCUSPARSE() / MatCUSPARSESetUseCPUSolve()) on unfactored matrices.
  Reading it through B->spptr would reinterpret the wrong structure.

  When use_cpu_solve is set, the solve routines installed by MatLUFactorNumeric_SeqAIJ() are
  left in place and nothing is copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; /* A is unfactored: spptr is a Mat_SeqAIJCUSPARSE */

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  /* the freshly computed factors live on the host only */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
210bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the SeqAIJCUSPARSE options from the options database.

  Input Parameters:
+ A                  - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
- PetscOptionsObject - the options context used by the PetscOptionsXXX() macros

  Notes:
  Options are only queried when A is not a factored matrix (A->factortype == MAT_FACTOR_NONE).

  The cuSPARSE algorithm options (CUDA 11.0 and later) are stored directly in the
  Mat_SeqAIJCUSPARSE structure. Because PetscOptionsEnum() derives the value from the position of
  the name in the name table, each of them is followed by a check that a representative cuSPARSE
  enum value still has the integer value assumed by the table.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  Mat_SeqAIJCUSPARSE      *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat",
                               MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, fmt));

    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat",
                               MatCUSPARSEStorageFormats, (PetscEnum)cusp->format, (PetscEnum *)&fmt, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, fmt));

    /* CPU versus GPU triangular solves */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusp->use_cpu_solve, &cusp->use_cpu_solve, &set));
    if (set) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* SpMV algorithm */
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t",
                               MatCUSPARSESpMVAlgorithms, (PetscEnum)cusp->spmvAlg, (PetscEnum *)&cusp->spmvAlg, &set));
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!set || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!set || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif

    /* SpMM algorithm */
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t",
                               MatCUSPARSESpMMAlgorithms, (PetscEnum)cusp->spmmAlg, (PetscEnum *)&cusp->spmmAlg, &set));
    PetscCheck(!set || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* CSR to CSC conversion algorithm */
    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t",
                               MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusp->csr2cscAlg, (PetscEnum *)&cusp->csr2cscAlg, &set));
    PetscCheck(!set || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
2459ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds (or refreshes the values of) the lower
  triangular factor used by the cuSPARSE triangular solve, from the host factor data of A.

  Input Parameter:
. A - the factored matrix; A->data is interpreted as Mat_SeqAIJ and A->spptr as
      Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Nothing is done for an empty matrix, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  The packed factor holds, for every row, the entries taken from the host arrays followed by an
  explicit 1.0 on the diagonal; row 0 consists of the diagonal entry only. It therefore has
  n + ai[n] - ai[1] entries.

  On the first call the full CSR structure is created, the cuSPARSE solve analysis is run, and the
  result is stored in the loTriFactorPtr field. The pinned host value buffer is kept in
  loTriFactor->AA_h so that later calls only repack the values and re-upload them.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) {
  Mat_SeqAIJ                        *aij         = (Mat_SeqAIJ *)A->data;
  const PetscInt                     n           = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors     = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)factors->loTriFactorPtr;
  const PetscInt                    *ai = aij->i, *aj = aij->j;
  const MatScalar                   *aa = aij->a;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the host copy is the one to use */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the lower triangular matrix, including the 1's on the diagonal */
    const PetscInt nzLower = n + ai[n] - ai[1];
    PetscInt       row, rowlen;
    PetscInt       dst; /* next free slot in the packed arrays */
    PetscInt       src; /* number of entries consumed so far from aa[] / aj[] */

    if (!loTriFactor) {
      PetscScalar *AALo;
      PetscInt    *AiLo, *AjLo;
      CsrMatrix   *csr;

      /* pinned host staging buffers for values, row offsets and column indices */
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
      PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

      /* row 0: just the unit diagonal */
      AiLo[0] = (PetscInt)0;
      AiLo[n] = nzLower;
      AjLo[0] = (PetscInt)0;
      AALo[0] = (MatScalar)1.0;

      /* rows 1..n-1: the stored entries of the row, then the unit diagonal */
      dst = 1;
      src = 0;
      for (row = 1; row < n; row++) {
        rowlen    = ai[row + 1] - ai[row];
        AiLo[row] = dst;

        PetscCall(PetscArraycpy(AjLo + dst, aj + src, rowlen));
        PetscCall(PetscArraycpy(AALo + dst, aa + src, rowlen));
        dst += rowlen;
        src += rowlen;

        AjLo[dst] = (PetscInt)row;
        AALo[dst] = (MatScalar)1.0;
        dst++;
      }

      /* allocate space for the triangular factor information */
      PetscCall(PetscNew(&loTriFactor));
      loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero-based, lower fill, unit diagonal */
      PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
      PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
      PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
      PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
      PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

      /* the solve uses the factor as stored (no transpose) */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR matrix filled from the staging buffers */
      csr                 = new CsrMatrix;
      loTriFactor->csrMat = csr;
      csr->num_rows       = n;
      csr->num_cols       = n;
      csr->num_entries    = nzLower;

      csr->row_offsets = new THRUSTINTARRAY32(n + 1);
      csr->row_offsets->assign(AiLo, AiLo + n + 1);

      csr->column_indices = new THRUSTINTARRAY32(nzLower);
      csr->column_indices->assign(AjLo, AjLo + nzLower);

      csr->values = new THRUSTARRAY(nzLower);
      csr->values->assign(AALo, AALo + nzLower);

      /* Create the solve analysis information */
      PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
      PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
      /* query and allocate the work buffer needed by the analysis and solve */
      PetscCallCUSPARSE(cusparseXcsrsv_buffsize(factors->handle, loTriFactor->solveOp, csr->num_rows, csr->num_entries, loTriFactor->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(),
                                                loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
      PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

      /* perform the solve analysis */
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
      PetscCallCUSPARSE(cusparseXcsrsv_analysis(factors->handle, loTriFactor->solveOp, csr->num_rows, csr->num_entries, loTriFactor->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(),
                                                loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
      PetscCallCUSPARSE(cusparseXcsrsv_analysis(factors->handle, loTriFactor->solveOp, csr->num_rows, csr->num_entries, loTriFactor->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(),
                                                loTriFactor->solveInfo));
#endif
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

      /* publish the factor; the value staging buffer is kept for later value-only updates */
      ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
      loTriFactor->AA_h                                          = AALo;
      PetscCallCUDA(cudaFreeHost(AiLo));
      PetscCallCUDA(cudaFreeHost(AjLo));
      PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
    } else {
      /* the structure already exists: repack and upload the values only */
      PetscScalar *hvals;

      if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
      hvals = loTriFactor->AA_h;

      hvals[0] = 1.0;
      dst      = 1;
      src      = 0;
      for (row = 1; row < n; row++) {
        rowlen = ai[row + 1] - ai[row];
        PetscCall(PetscArraycpy(hvals + dst, aa + src, rowlen));
        dst += rowlen;
        src += rowlen;
        hvals[dst] = 1.0;
        dst++;
      }
      loTriFactor->csrMat->values->assign(hvals, hvals + nzLower);
      PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
    }
  } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  PetscFunctionReturn(0);
}
3789ae82921SPaul Mullowney 
3799371c9d4SSatish Balay static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) {
3809ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3819ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3829ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
383aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
3849ae82921SPaul Mullowney   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
3859ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
3869ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
3879ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
3889ae82921SPaul Mullowney 
3899ae82921SPaul Mullowney   PetscFunctionBegin;
390cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
391c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3929ae82921SPaul Mullowney     try {
3939ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
3949ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
395da79fbbcSStefano Zampini       if (!upTriFactor) {
3962cbc15d9SMark         PetscScalar *AAUp;
3972cbc15d9SMark 
3989566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
3992cbc15d9SMark 
4009ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4019566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4029566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4039ae82921SPaul Mullowney 
4049ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4059ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4069ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4079ae82921SPaul Mullowney         offset  = nzUpper;
4089ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4099ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4109ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4119ae82921SPaul Mullowney 
412e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4139ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4149ae82921SPaul Mullowney 
415e057df02SPaul Mullowney           /* decrement the offset */
4169ae82921SPaul Mullowney           offset -= (nz + 1);
4179ae82921SPaul Mullowney 
418e057df02SPaul Mullowney           /* first, set the diagonal elements */
4199ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
42009f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
4219ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
4229ae82921SPaul Mullowney 
4239566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
4249566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
4259ae82921SPaul Mullowney         }
4262205254eSKarl Rupp 
427aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4289566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
429da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
4302205254eSKarl Rupp 
431aa372e3fSPaul Mullowney         /* Create the matrix description */
4329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
4339566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4341b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4359566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
436afb2bd1cSJunchao Zhang #else
4379566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
438afb2bd1cSJunchao Zhang #endif
4399566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
4409566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
441aa372e3fSPaul Mullowney 
442aa372e3fSPaul Mullowney         /* set the operation */
443aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
444aa372e3fSPaul Mullowney 
445aa372e3fSPaul Mullowney         /* set the matrix */
446aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
447aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
448aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
449aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
450aa372e3fSPaul Mullowney 
451aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
452aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
453aa372e3fSPaul Mullowney 
454aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
455aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
456aa372e3fSPaul Mullowney 
457aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
458aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
459aa372e3fSPaul Mullowney 
460afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4619566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
462261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
4631b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4649371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4659371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
4669566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
467afb2bd1cSJunchao Zhang #endif
468afb2bd1cSJunchao Zhang 
469aa372e3fSPaul Mullowney         /* perform the solve analysis */
4709371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4719371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
4721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4739371c9d4SSatish Balay                                                   upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
474d49cd2b7SBarry Smith #else
4755f80ce2aSJacob Faibussowitsch                                                   upTriFactor->solveInfo));
476afb2bd1cSJunchao Zhang #endif
4779566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4789566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
479aa372e3fSPaul Mullowney 
480da79fbbcSStefano Zampini         /* assign the pointer */
481aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
4822cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
4839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
4849566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
4859566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
486da79fbbcSStefano Zampini       } else {
48748a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
488da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
489da79fbbcSStefano Zampini         offset = nzUpper;
490da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
491da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
492da79fbbcSStefano Zampini 
493da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
494da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
495da79fbbcSStefano Zampini 
496da79fbbcSStefano Zampini           /* decrement the offset */
497da79fbbcSStefano Zampini           offset -= (nz + 1);
498da79fbbcSStefano Zampini 
499da79fbbcSStefano Zampini           /* first, set the diagonal elements */
5002cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
5019566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
502da79fbbcSStefano Zampini         }
5032cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5049566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
505da79fbbcSStefano Zampini       }
5069371c9d4SSatish Balay     } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
5079ae82921SPaul Mullowney   }
5089ae82921SPaul Mullowney   PetscFunctionReturn(0);
5099ae82921SPaul Mullowney }
5109ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds the lower and upper triangular ILU factors of A for
  CUSPARSE and uploads the row/column permutation indices when they are not the identity.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Fails with PETSC_ERR_COR when A->spptr is NULL. On success A->offloadmask is PETSC_OFFLOAD_BOTH.
  A permutation array that already exists on the container is left untouched, so the indices are uploaded
  at most once per container.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowPerm = aij->row;
  IS                            colPerm = aij->icol;
  const PetscInt                nrows   = A->rmap->n;
  PetscBool                     rowIsIdentity, colIsIdentity;

  PetscFunctionBegin;
  PetscCheck(factors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");

  /* build (or refresh) both triangular factors */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  /* work vector of local row length, allocated on first use only */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: needed only for a non-identity ordering that has not been uploaded yet */
  PetscCall(ISIdentity(rowPerm, &rowIsIdentity));
  if (!(rowIsIdentity || factors->rpermIndices)) {
    const PetscInt *rowIdx;

    PetscCall(ISGetIndices(rowPerm, &rowIdx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(rowIdx, rowIdx + nrows);
    PetscCall(ISRestoreIndices(rowPerm, &rowIdx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }

  /* column permutation: same treatment */
  PetscCall(ISIdentity(colPerm, &colIsIdentity));
  if (!(colIsIdentity || factors->cpermIndices)) {
    const PetscInt *colIdx;

    PetscCall(ISGetIndices(colPerm, &colIdx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(colIdx, colIdx + nrows);
    PetscCall(ISRestoreIndices(colPerm, &colIdx));
    PetscCall(PetscLogCpuToGpu(nrows * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
5529ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCSetUpTriFactor_Private - Creates one triangular-solve factor for the ICC/Cholesky path
  from host CSR arrays, uploads it to the GPU and runs the CUSPARSE solve analysis on it.

  Input Parameters:
+ A     - the factored matrix; supplies the local sizes, the CUSPARSE handle (through A->spptr) and the logging object
. lower - PETSC_FALSE: unit-diagonal descriptor solved with CUSPARSE_OPERATION_NON_TRANSPOSE;
          PETSC_TRUE: non-unit-diagonal descriptor solved with CUSPARSE_OPERATION_TRANSPOSE
. nz    - number of stored entries
. Ai    - host CSR row offsets, A->rmap->n + 1 entries
. Aj    - host CSR column indices, nz entries
- AA    - host values, nz entries

  Output Parameter:
. triFactor - the newly created factor; it is NOT attached to A->spptr here, the caller does that

  Notes:
  Both variants use a zero-based descriptor with upper fill mode: the same sparsity pattern serves both
  solves and only the operation differs.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCSetUpTriFactor_Private(Mat A, PetscBool lower, PetscInt nz, const PetscInt *Ai, const PetscInt *Aj, const PetscScalar *AA, Mat_SeqAIJCUSPARSETriFactorStruct **triFactor) {
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *factor;

  PetscFunctionBegin;
  try {
    /* allocate space for the triangular factor information */
    PetscCall(PetscNew(&factor));
    factor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

    /* Create the matrix description */
    PetscCallCUSPARSE(cusparseCreateMatDescr(&factor->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(factor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
    PetscCallCUSPARSE(cusparseSetMatType(factor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
    PetscCallCUSPARSE(cusparseSetMatType(factor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
    PetscCallCUSPARSE(cusparseSetMatFillMode(factor->descr, CUSPARSE_FILL_MODE_UPPER));
    PetscCallCUSPARSE(cusparseSetMatDiagType(factor->descr, lower ? CUSPARSE_DIAG_TYPE_NON_UNIT : CUSPARSE_DIAG_TYPE_UNIT));

    /* set the operation */
    factor->solveOp = lower ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;

    /* set the matrix: the assign() calls copy the host arrays into device storage */
    factor->csrMat              = new CsrMatrix;
    factor->csrMat->num_rows    = A->rmap->n;
    factor->csrMat->num_cols    = A->cmap->n;
    factor->csrMat->num_entries = nz;

    factor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
    factor->csrMat->row_offsets->assign(Ai, Ai + A->rmap->n + 1);

    factor->csrMat->column_indices = new THRUSTINTARRAY32(nz);
    factor->csrMat->column_indices->assign(Aj, Aj + nz);

    factor->csrMat->values = new THRUSTARRAY(nz);
    factor->csrMat->values->assign(AA, AA + nz);

    /* Create the solve analysis information */
    PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
    PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&factor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
    PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, factor->solveOp, factor->csrMat->num_rows, factor->csrMat->num_entries, factor->descr, factor->csrMat->values->data().get(),
                                              factor->csrMat->row_offsets->data().get(), factor->csrMat->column_indices->data().get(), factor->solveInfo, &factor->solveBufferSize));
    PetscCallCUDA(cudaMalloc(&factor->solveBuffer, factor->solveBufferSize));
#endif

    /* perform the solve analysis */
    PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, factor->solveOp, factor->csrMat->num_rows, factor->csrMat->num_entries, factor->descr, factor->csrMat->values->data().get(),
                                              factor->csrMat->row_offsets->data().get(), factor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                              factor->solveInfo, factor->solvePolicy, factor->solveBuffer));
#else
                                              factor->solveInfo));
#endif
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

    *triFactor = factor;
  } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - Builds, or refreshes the values of, the two triangular factors used
  by the CUSPARSE ICC/Cholesky solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Nothing is done when A has no local rows, or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED
  nor PETSC_OFFLOAD_CPU.

  For each row the last stored entry d is treated as the diagonal and the preceding entries u as the
  off-diagonal part. The host staging arrays are filled as
.vb
    AAUp: [ 1/d, -u ... ]      AALo: [ 1/d, -u/d ... ]
.ve
  with the diagonal slot moved to the front of the row.

  When neither factor exists yet, both are created (sparsity pattern, values and solve analysis). When both
  exist, only their device values are overwritten. Exactly one factor existing is reported as
  PETSC_ERR_COR before any host buffer is allocated, so that error path does not leak pinned memory.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) {
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp = NULL, *AjUp = NULL;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* first call: neither factor exists, so the pattern and the analysis must be created as well */
    const PetscBool firstTime = (!upTriFactor && !loTriFactor) ? PETSC_TRUE : PETSC_FALSE;

    if (!firstTime) {
      /* a value-only refresh needs both factors; check before allocating anything */
      PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
      PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
    }
    try {
      /* pinned host staging buffers for the values of both factors */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (firstTime) {
        /* Allocate Space for the sparsity pattern shared by both factors */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
      }

      /* Fill the staging arrays; the pattern arrays are written on the first call only */
      offset = 0;
      for (i = 0; i < n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements */
        if (firstTime) {
          AjUp[offset] = (PetscInt)i;
          AiUp[i]      = offset;
        }
        AAUp[offset] = (MatScalar)1.0 / v[nz];
        AALo[offset] = (MatScalar)1.0 / v[nz];

        offset += 1;
        if (nz > 0) {
          if (firstTime) PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
          for (j = offset; j < offset + nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j] / v[nz];
          }
          offset += nz;
        }
      }

      if (firstTime) {
        /* upper factor: unit diagonal, non-transposed solve */
        PetscCall(MatSeqAIJCUSPARSEICCSetUpTriFactor_Private(A, PETSC_FALSE, nzUpper, AiUp, AjUp, AAUp, &upTriFactor));
        cusparseTriFactors->upTriFactorPtr = upTriFactor;

        /* lower factor: same pattern, non-unit diagonal, transposed solve */
        PetscCall(MatSeqAIJCUSPARSEICCSetUpTriFactor_Private(A, PETSC_TRUE, nzUpper, AiUp, AjUp, AALo, &loTriFactor));
        cusparseTriFactors->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* pattern and analysis are kept; only the device values are replaced */
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  }
  PetscFunctionReturn(0);
}
754087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC/Cholesky triangular factors of A for CUSPARSE
  and uploads the permutation and its inverse when the ordering is not the identity.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Fails with PETSC_ERR_COR when A->spptr is NULL. On success A->offloadmask is PETSC_OFFLOAD_BOTH.

  rpermIndices receives the indices of the row permutation and cpermIndices those of its inverse. Device
  arrays that already exist on the container are reused and simply overwritten, so calling this routine
  again does not orphan the previous arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) {
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            ip                 = a->row;
  PetscBool                     perm_identity;
  PetscInt                      n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
  /* count of the strictly triangular part twice plus one diagonal */
  cusparseTriFactors->nnz = (a->nz - n) * 2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices are only needed for a non-identity ordering */
  PetscCall(ISIdentity(ip, &perm_identity));
  if (!perm_identity) {
    IS              iip;
    const PetscInt *irip, *rip;

    PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
    PetscCall(ISGetIndices(iip, &irip));
    PetscCall(ISGetIndices(ip, &rip));
    /* allocate only when absent; assign() then overwrites the contents either way */
    if (!cusparseTriFactors->rpermIndices) cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rip, rip + n);
    if (!cusparseTriFactors->cpermIndices) cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(irip, irip + n);
    PetscCall(ISRestoreIndices(iip, &irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip, &rip));
    PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
790087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for the SeqAIJCUSPARSE type.

  Input Parameters:
+ B    - the factor matrix to fill
. A    - the matrix to factor
- info - factorization options, forwarded unchanged

  Notes:
  A is first synchronized with MatSeqAIJCUSPARSECopyFromGPU(), the factorization itself is delegated to
  MatCholeskyFactorNumeric_SeqAIJ(), and B is then flagged PETSC_OFFLOAD_CPU. Afterwards the solve
  callbacks are selected from the ordering of B and the triangular factors are built for CUSPARSE.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJ *fact     = (Mat_SeqAIJ *)B->data;
  IS          ordering = fact->row;
  PetscBool   natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the solve variant: the natural-ordering routines apply only to an identity permutation */
  PetscCall(ISIdentity(ordering, &natural));
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* no multi-right-hand-side solves are installed for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
8189ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private - Builds the explicit transpose of one triangular
  factor on the GPU and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix (only used as the object attached to the log events)
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the existing (non-transposed) triangular factor; on CUDA >= 11 its
                       csr2cscBuffer/csr2cscBufferSize fields are (re)assigned here

  Output Parameter:
. triFactorTOut - newly allocated factor structure holding the transposed CSR data, its matrix
                  descriptor, and the solve analysis info

  Notes:
  The transposed descriptor copies the matrix type, index base and diagonal type of the input
  descriptor and flips the fill mode (UPPER <-> LOWER). Because the transpose is stored explicitly,
  the solve operation is CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(Mat A, Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors, Mat_SeqAIJCUSPARSETriFactorStruct *triFactor, Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut) {
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  CsrMatrix                         *csr = triFactor->csrMat;
  CsrMatrix                         *csrT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  PetscCall(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the original factor; the fill mode is flipped since we store the transpose */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* create the matrix description of the transposed factor */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the factor (i.e. the CSR of its transpose) */
  csrT                 = new CsrMatrix;
  triFactorT->csrMat   = csrT;
  csrT->num_rows       = csr->num_cols;
  csrT->num_cols       = csr->num_rows;
  csrT->num_entries    = csr->num_entries;
  csrT->row_offsets    = new THRUSTINTARRAY32(csrT->num_rows + 1);
  csrT->column_indices = new THRUSTINTARRAY32(csrT->num_entries);
  csrT->values         = new THRUSTARRAY(csrT->num_entries);

  /* compute the transpose of the factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* the conversion work buffer is stored on the original (non-transposed) factor */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, csr->num_rows, csr->num_cols, csr->num_entries, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), csrT->values->data().get(),
                                                  csrT->row_offsets->data().get(), csrT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactor->csr2cscBuffer, triFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, csr->num_rows, csr->num_cols, csr->num_entries, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), csrT->values->data().get(),
                                     csrT->row_offsets->data().get(), csrT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer));
#else
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, csr->num_rows, csr->num_cols, csr->num_entries, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), csrT->values->data().get(),
                                     csrT->column_indices->data().get(), csrT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* close the event opened above (this used to be a second PetscLogEventBegin(), leaving the event unbalanced) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, triFactorT->solveOp, csrT->num_rows, csrT->num_entries, triFactorT->descr, csrT->values->data().get(), csrT->row_offsets->data().get(), csrT->column_indices->data().get(), triFactorT->solveInfo,
                                            &triFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactorT->solveBuffer, triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, triFactorT->solveOp, csrT->num_rows, csrT->num_entries, triFactorT->descr, csrT->values->data().get(), csrT->row_offsets->data().get(), csrT->column_indices->data().get(), triFactorT->solveInfo,
                                            triFactorT->solvePolicy, triFactorT->solveBuffer));
#else
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, triFactorT->solveOp, csrT->num_rows, csrT->num_entries, triFactorT->descr, csrT->values->data().get(), csrT->row_offsets->data().get(), csrT->column_indices->data().get(), triFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Creates explicit transposes of the lower and upper
  triangular factors stored in A->spptr, analyzes them for triangular solves, and stores the results
  in loTriFactorPtrTranspose / upTriFactorPtrTranspose.

  Input Parameter:
. A - factored matrix whose spptr is a Mat_SeqAIJCUSPARSETriFactors with both loTriFactorPtr and
      upTriFactorPtr set

  Notes:
  The lower factor is processed first, then the upper factor. Each transpose pointer is assigned
  only after its factor has been fully built and analyzed.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) {
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A, cusparseTriFactors, loTriFactor, &loTriFactorT));
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A, cusparseTriFactors, upTriFactor, &upTriFactorT));
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
981bda325fcSPaul Mullowney 
/* Unary functor converting a PetscScalar to a PetscInt by casting its real part.
   The call operator is const since the functor carries no state, so it can also be
   invoked through a const functor object. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) const { return (PetscInt)PetscRealPart(s); }
};
985a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates, or refreshes the values of, the explicit GPU
  transpose stored in cusparsestruct->matTranspose.

  Input Parameter:
. A - the matrix; its spptr is a Mat_SeqAIJCUSPARSE and its data a Mat_SeqAIJ

  Notes:
  Returns immediately when A->transupdated is already set. Otherwise:
  - for a non-CSR storage format any existing transpose is invalidated and rebuilt from scratch;
  - for CSR the transpose structure is created if missing, a CSR->CSC permutation (csr2csc_i) is
    computed once by transposing the sequence 0,1,2,... in place of the values, and the values of
    the transpose are then gathered from the matrix values through that permutation.
  On success A->transupdated is set to PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  /* only the CSR transpose can be updated in place; other formats are rebuilt */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
#if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
#else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
#endif
#endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns, so its row offsets (the CSC column
         pointers written by csr2csc) need A->cmap->n + 1 entries; sizing them with A->rmap->n + 1
         overruns the buffer for rectangular matrices with more columns than rows. */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(A->cmap->n + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transpose */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
#endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* transpose the sequence 0,1,2,... instead of the values: the "values" of the result are then
         the positions in the original matrix, i.e. the CSR->CSC permutation */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
#endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
#endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* convert the transposed sequence (stored as scalars in matrixT->values) into integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
#endif
    }
    /* gather the current matrix values into the transpose through the cached permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices                       = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1177bda325fcSPaul Mullowney 
1178a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
11799371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
1180c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1181465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1182465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1183465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1184465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1185bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1186bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1187aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1188aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1189aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1190bda325fcSPaul Mullowney 
1191bda325fcSPaul Mullowney   PetscFunctionBegin;
1192aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1193aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
11949566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1195aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1196aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1197bda325fcSPaul Mullowney   }
1198bda325fcSPaul Mullowney 
1199bda325fcSPaul Mullowney   /* Get the GPU pointers */
12009566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12019566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1202c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1203c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1204bda325fcSPaul Mullowney 
12059566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1206aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
12079371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);
1208aa372e3fSPaul Mullowney 
1209aa372e3fSPaul Mullowney   /* First, solve U */
12109371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
12111b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1212afb2bd1cSJunchao Zhang                               upTriFactorT->csrMat->num_entries,
1213afb2bd1cSJunchao Zhang #endif
12149371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
12151b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12169371c9d4SSatish Balay                               tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
12179371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1218d49cd2b7SBarry Smith #else
12199371c9d4SSatish Balay                               tempGPU->data().get());
12209371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1221afb2bd1cSJunchao Zhang #endif
1222aa372e3fSPaul Mullowney 
1223aa372e3fSPaul Mullowney   /* Then, solve L */
12249371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
12251b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1226afb2bd1cSJunchao Zhang                               loTriFactorT->csrMat->num_entries,
1227afb2bd1cSJunchao Zhang #endif
12289371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1229d49cd2b7SBarry Smith                               tempGPU->data().get(),
12301b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12319371c9d4SSatish Balay                               xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
12329371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1233d49cd2b7SBarry Smith #else
12349371c9d4SSatish Balay                               xarray);
12359371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1236afb2bd1cSJunchao Zhang #endif
1237aa372e3fSPaul Mullowney 
1238aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
12399371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());
1240aa372e3fSPaul Mullowney 
1241aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1242a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);
1243bda325fcSPaul Mullowney 
1244bda325fcSPaul Mullowney   /* restore */
12459566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
12469566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
12479566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12489566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1249bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1250bda325fcSPaul Mullowney }
1251bda325fcSPaul Mullowney 
12529371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1253465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1254465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1255bda325fcSPaul Mullowney   cusparseStatus_t                   stat;
1256bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1257aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1258aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1259aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
1260bda325fcSPaul Mullowney 
1261bda325fcSPaul Mullowney   PetscFunctionBegin;
1262aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1263aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
12649566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1265aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
1266aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
1267bda325fcSPaul Mullowney   }
1268bda325fcSPaul Mullowney 
1269bda325fcSPaul Mullowney   /* Get the GPU pointers */
12709566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
12719566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1272bda325fcSPaul Mullowney 
12739566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1274aa372e3fSPaul Mullowney   /* First, solve U */
12759371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
12761b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1277afb2bd1cSJunchao Zhang                               upTriFactorT->csrMat->num_entries,
1278afb2bd1cSJunchao Zhang #endif
12799371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
12801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12819371c9d4SSatish Balay                               tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
12829371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1283d49cd2b7SBarry Smith #else
12849371c9d4SSatish Balay                               tempGPU->data().get());
12859371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1286afb2bd1cSJunchao Zhang #endif
1287aa372e3fSPaul Mullowney 
1288aa372e3fSPaul Mullowney   /* Then, solve L */
12899371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
12901b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1291afb2bd1cSJunchao Zhang                               loTriFactorT->csrMat->num_entries,
1292afb2bd1cSJunchao Zhang #endif
12939371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1294d49cd2b7SBarry Smith                               tempGPU->data().get(),
12951b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
12969371c9d4SSatish Balay                               xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
12979371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1298d49cd2b7SBarry Smith #else
12999371c9d4SSatish Balay                               xarray);
13009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1301afb2bd1cSJunchao Zhang #endif
1302bda325fcSPaul Mullowney 
1303bda325fcSPaul Mullowney   /* restore */
13049566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13059566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13069566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
1308bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1309bda325fcSPaul Mullowney }
1310bda325fcSPaul Mullowney 
13119371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx) {
1312465f34aeSAlejandro Lamas Daviña   const PetscScalar                    *barray;
1313465f34aeSAlejandro Lamas Daviña   PetscScalar                          *xarray;
1314465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1315465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
13169ae82921SPaul Mullowney   cusparseStatus_t                      stat;
13179ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1318aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1319aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1320aa372e3fSPaul Mullowney   THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
13219ae82921SPaul Mullowney 
13229ae82921SPaul Mullowney   PetscFunctionBegin;
1323ebc8f436SDominic Meiser 
1324e057df02SPaul Mullowney   /* Get the GPU pointers */
13259566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
13269566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
1327c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1328c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
13299ae82921SPaul Mullowney 
13309566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1331aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
13329371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());
1333aa372e3fSPaul Mullowney 
1334aa372e3fSPaul Mullowney   /* Next, solve L */
13359371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
13361b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1337afb2bd1cSJunchao Zhang                               loTriFactor->csrMat->num_entries,
1338afb2bd1cSJunchao Zhang #endif
13399371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
1340d49cd2b7SBarry Smith                               tempGPU->data().get(),
13411b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
13429371c9d4SSatish Balay                               xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
13439371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1344d49cd2b7SBarry Smith #else
13459371c9d4SSatish Balay                               xarray);
13469371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1347afb2bd1cSJunchao Zhang #endif
1348aa372e3fSPaul Mullowney 
1349aa372e3fSPaul Mullowney   /* Then, solve U */
13509371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
13511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1352afb2bd1cSJunchao Zhang                               upTriFactor->csrMat->num_entries,
1353afb2bd1cSJunchao Zhang #endif
13549371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
13551b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
13569371c9d4SSatish Balay                               tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
13579371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1358d49cd2b7SBarry Smith #else
13599371c9d4SSatish Balay                               tempGPU->data().get());
13609371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1361afb2bd1cSJunchao Zhang #endif
1362d49cd2b7SBarry Smith 
13634e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
13649371c9d4SSatish Balay   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);
13659ae82921SPaul Mullowney 
13669566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
13679566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
13689566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13699566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
13709ae82921SPaul Mullowney   PetscFunctionReturn(0);
13719ae82921SPaul Mullowney }
13729ae82921SPaul Mullowney 
13739371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1374465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1375465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
13769ae82921SPaul Mullowney   cusparseStatus_t                   stat;
13779ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
1378aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
1379aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
1380aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;
13819ae82921SPaul Mullowney 
13829ae82921SPaul Mullowney   PetscFunctionBegin;
1383e057df02SPaul Mullowney   /* Get the GPU pointers */
13849566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
13859566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb, &barray));
13869ae82921SPaul Mullowney 
13879566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1388aa372e3fSPaul Mullowney   /* First, solve L */
13899371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
13901b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1391afb2bd1cSJunchao Zhang                               loTriFactor->csrMat->num_entries,
1392afb2bd1cSJunchao Zhang #endif
13939371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
13941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
13959371c9d4SSatish Balay                               tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
13969371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1397d49cd2b7SBarry Smith #else
13989371c9d4SSatish Balay                               tempGPU->data().get());
13999371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1400afb2bd1cSJunchao Zhang #endif
1401d49cd2b7SBarry Smith 
1402aa372e3fSPaul Mullowney   /* Next, solve U */
14039371c9d4SSatish Balay   stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
14041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
1405afb2bd1cSJunchao Zhang                               upTriFactor->csrMat->num_entries,
1406afb2bd1cSJunchao Zhang #endif
14079371c9d4SSatish Balay                               &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
1408d49cd2b7SBarry Smith                               tempGPU->data().get(),
14091b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
14109371c9d4SSatish Balay                               xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
14119371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1412d49cd2b7SBarry Smith #else
14139371c9d4SSatish Balay                               xarray);
14149371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
1415afb2bd1cSJunchao Zhang #endif
14169ae82921SPaul Mullowney 
14179566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb, &barray));
14189566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
14199566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14209566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
14219ae82921SPaul Mullowney   PetscFunctionReturn(0);
14229ae82921SPaul Mullowney }
14239ae82921SPaul Mullowney 
1424da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in CUDA-11.3, which ships cuSPARSE 11.5 (hence the 11500 version guard) */
14269371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
1427da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1428da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1429da112707SJunchao Zhang   const PetscScalar            *barray;
1430da112707SJunchao Zhang   PetscScalar                  *xarray;
1431da112707SJunchao Zhang 
1432da112707SJunchao Zhang   PetscFunctionBegin;
1433da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1434da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1435da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1436da112707SJunchao Zhang 
1437da112707SJunchao Zhang   /* Solve L*y = b */
1438da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1439da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
14409371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
14419371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT,
144212ba2bc6SJunchao Zhang                                        fs->spsvDescr_L)); // cusparseSpSV_solve() scretely uses the external buffer used in cusparseSpSV_analysis()!
1443da112707SJunchao Zhang 
1444da112707SJunchao Zhang   /* Solve U*x = y */
1445da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
14469371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* U X = Y */
14479371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U));
1448da112707SJunchao Zhang 
1449da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1450da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1451da112707SJunchao Zhang 
1452da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1453da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1454da112707SJunchao Zhang   PetscFunctionReturn(0);
1455da112707SJunchao Zhang }
1456da112707SJunchao Zhang 
14579371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x) {
1458da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1459da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1460da112707SJunchao Zhang   const PetscScalar            *barray;
1461da112707SJunchao Zhang   PetscScalar                  *xarray;
1462da112707SJunchao Zhang 
1463da112707SJunchao Zhang   PetscFunctionBegin;
146412ba2bc6SJunchao Zhang   if (!fs->createdTransposeSpSVDescr) { /* Call MatSolveTranspose() for the first time */
1465da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
14669371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do tranpose solve with it */
14679371c9d4SSatish Balay                                               fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1468da112707SJunchao Zhang 
1469da112707SJunchao Zhang     PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
14709371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
1471da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
147212ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
147312ba2bc6SJunchao Zhang     fs->createdTransposeSpSVDescr = PETSC_TRUE;
147412ba2bc6SJunchao Zhang   }
1475da112707SJunchao Zhang 
147612ba2bc6SJunchao Zhang   if (!fs->updatedTransposeSpSVAnalysis) {
14779371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1478da112707SJunchao Zhang 
14799371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
148012ba2bc6SJunchao Zhang     fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
1481da112707SJunchao Zhang   }
1482da112707SJunchao Zhang 
1483da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1484da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1485da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1486da112707SJunchao Zhang 
1487da112707SJunchao Zhang   /* Solve Ut*y = b */
1488da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1489da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
14909371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
14919371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));
1492da112707SJunchao Zhang 
1493da112707SJunchao Zhang   /* Solve Lt*x = y */
1494da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
14959371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
14969371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1497da112707SJunchao Zhang 
1498da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1499da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1500da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1501da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1502da112707SJunchao Zhang   PetscFunctionReturn(0);
1503da112707SJunchao Zhang }
1504da112707SJunchao Zhang 
15059371c9d4SSatish Balay static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info) {
1506da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1507da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1508da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1509da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1510da112707SJunchao Zhang   PetscInt                      m, nz;
1511da112707SJunchao Zhang   PetscBool                     flg;
1512da112707SJunchao Zhang 
1513da112707SJunchao Zhang   PetscFunctionBegin;
1514da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1515da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1516da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1517da112707SJunchao Zhang   }
1518da112707SJunchao Zhang 
1519da112707SJunchao Zhang   /* Copy A's value to fact */
1520da112707SJunchao Zhang   m  = fact->rmap->n;
1521da112707SJunchao Zhang   nz = aij->nz;
1522da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1523da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1524da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1525da112707SJunchao Zhang 
1526da112707SJunchao Zhang   /* Factorize fact inplace */
15279371c9d4SSatish Balay   if (m)
15289371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
15299371c9d4SSatish Balay                                         fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1530da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1531da112707SJunchao Zhang     int              numerical_zero;
1532da112707SJunchao Zhang     cusparseStatus_t status;
1533da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
1534da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1535da112707SJunchao Zhang   }
1536da112707SJunchao Zhang 
153712ba2bc6SJunchao Zhang   /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
153812ba2bc6SJunchao Zhang      See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
153912ba2bc6SJunchao Zhang   */
15409371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1541da112707SJunchao Zhang 
15429371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));
1543da112707SJunchao Zhang 
154412ba2bc6SJunchao Zhang   /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
154512ba2bc6SJunchao Zhang   fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
154612ba2bc6SJunchao Zhang 
1547da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1548da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
1549da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
1550da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1551da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1552da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1553da112707SJunchao Zhang   PetscFunctionReturn(0);
1554da112707SJunchao Zhang }
1555da112707SJunchao Zhang 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - Symbolic setup for an in-place ILU(0) factorization done with cuSPARSE

  Input Parameters:
+ fact  - factor matrix; its spptr (Mat_SeqAIJCUSPARSETriFactors) is reset and repopulated here
. A     - matrix to factor; in debug builds it is checked to be MATSEQAIJCUSPARSE, square, and to have every diagonal entry
. isrow - row ordering, not read by this routine
. iscol - column ordering, not read by this routine
- info  - factorization options; only info->fill is read (stored in fact->info.fill_ratio_given)

  Notes:
  A's row pointers and column indices are copied into fs->csrRowPtr/fs->csrColIdx; fs->csrVal is allocated but its
  values are not set here. L and U descriptors are both created on top of these same three arrays.

  On return fact->ops->lufactornumeric is MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 and fs->numericFactFlops holds an
  estimate of the flops of the numeric phase.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  /* L: lower fill mode with an implicit unit diagonal, viewing the shared csrRowPtr/csrColIdx/csrVal arrays */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* U: upper fill mode with a stored (non-unit) diagonal, viewing the same arrays */
  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  /* Work vectors backing the dense-vector descriptors used by the SpSV calls */
  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i]; /* exact number of entries strictly left of the diagonal in row i */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Eliminating the k-th (0-based) left entry touches nzRow - k entries, so the row costs
          sum_{k=0}^{nzLeft-1} 2*(nzRow - k) = nzLeft * (2*nzRow - nzLeft + 1) flops.
          nzLeft must stay the exact count from Adiag; do not replace it with the (nzRow - 1) / 2
          symmetric-pattern guess, which is only appropriate when the diagonal position is unknown.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}
1711da112707SJunchao Zhang 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - Apply the IC(0) factor stored in fact->spptr to a right-hand side

  Input Parameters:
+ fact - factored matrix carrying a Mat_SeqAIJCUSPARSETriFactors in spptr
- b    - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. x    - solution, accessed write-only through VecCUDAGetArrayWrite()

  Notes:
  Two cusparseSpSV_solve() calls are issued on the same L descriptor: a non-transposed solve from b into the
  work vector fs->Y, then a transposed solve from fs->Y into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) {
  Mat_SeqAIJCUSPARSETriFactors *tri     = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &sol));
  PetscCall(VecCUDAGetArrayRead(b, &rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* Step 1, L*y = b: point descriptor X at b's array and descriptor Y at the work vector */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, (void *)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_Y, tri->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, /* L Y = X */
                                       tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_L));

  /* Step 2, Lt*x = y: re-point descriptor X at x's array; Y now plays the role of the input */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tri->dnVecDescr_X, sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, /* Lt X = Y */
                                       tri->dnVecDescr_Y, tri->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt));

  /* Hand the arrays back before closing the timed region, in the same order as before */
  PetscCall(VecCUDARestoreArrayRead(b, &rhs));
  PetscCall(VecCUDARestoreArrayWrite(x, &sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * factAij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1741da112707SJunchao Zhang 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric phase of the in-place IC(0) factorization done with cuSPARSE

  Input Parameters:
+ fact - factor matrix whose spptr was prepared by the symbolic phase
. A    - matrix supplying the values; in debug builds it is checked to be MATSEQAIJCUSPARSE
- info - factorization options, not read by this routine

  Notes:
  A's values are copied into fs->csrVal, factored in place with cusparseXcsric02(), and then the SpSV analyses for
  the non-transposed and transposed solves with L are run. On return fact->ops->solve and fact->ops->solvetranspose
  both point at MatSolve_SeqAIJCUSPARSE_ICC0.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *tri     = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *gpuA    = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *csrA;
  PetscInt                      nrows, nnz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Bring A's values onto the device (if needed) and copy them into the factor's value array */
  nrows = fact->rmap->n;
  nnz   = factAij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  csrA = (CsrMatrix *)gpuA->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(tri->csrVal, csrA->values->data().get(), sizeof(PetscScalar) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* In-place factorization.
     Per https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve, csric02() reads only the lower triangular
     part of the matrix: the matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, fill mode and diagonal type are
     ignored, and the strictly upper triangular part is neither read nor written. So whether or not A is Hermitian,
     csric02() treats it as Hermitian with only the lower triangle supplied.
     The call is skipped for an empty matrix (nrows == 0).
   */
  if (nrows) PetscCallCUSPARSE(cusparseXcsric02(tri->handle, nrows, nnz, tri->matDescr_M, tri->csrVal, tri->csrRowPtr, tri->csrColIdx, tri->ic0Info_M, tri->policy_M, tri->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              zeroPivotRow;
    cusparseStatus_t pivotStatus = cusparseXcsric02_zeroPivot(tri->handle, tri->ic0Info_M, &zeroPivotRow);

    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != pivotStatus, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", zeroPivotRow, zeroPivotRow);
  }

  /* Analysis for the non-transposed solve with L, done after the factor values are in place */
  PetscCallCUSPARSE(cusparseSpSV_analysis(tri->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_L, tri->spsvBuffer_L));

  /* Analysis for the transposed solve with L. Plain TRANSPOSE is used because, with double and
     CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE, cusparse reports:
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt, tri->spsvBuffer_Lt));

  /* Publish the solve operations; the same routine serves solve and solvetranspose */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(tri->numericFactFlops));
  PetscFunctionReturn(0);
}
1793da112707SJunchao Zhang 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic setup for an in-place IC(0) factorization done with cuSPARSE

  Input Parameters:
+ fact - factor matrix; its spptr (Mat_SeqAIJCUSPARSETriFactors) is reset and repopulated here
. A    - matrix to factor; in debug builds it is checked to be MATSEQAIJCUSPARSE, square, and to have every diagonal entry
. perm - ordering, not read by this routine
- info - factorization options; only info->fill is read (stored in fact->info.fill_ratio_given)

  Notes:
  A's row pointers and column indices are copied into the factor's device arrays; the value array is allocated but
  its values are not set here. On return fact->ops->choleskyfactornumeric is MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0
  and numericFactFlops holds an estimate of the flops of the numeric phase.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *tri     = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      nrows, nnz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  missingRow;
    PetscBool isCusparse, diagMissing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isCusparse));
    PetscCheck(isCusparse, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &diagMissing, &missingRow));
    PetscCheck(!diagMissing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, missingRow);
  }

  /* Drop whatever a previous factorization left in the tri-factor struct */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&tri));

  /* Duplicate A's meta data into fact. This also allocates fact's host i,j,a arrays; they are
     not used by the device factorization and are kept only to ease debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  factAij->row = NULL;
  factAij->col = NULL;

  /* ---------------------------------------------------------------------- */
  /* Device CSR arrays of fact: copy A's i, j; allocate (but do not fill)   */
  /* the value array, which the numeric phase factors in place              */
  /* ---------------------------------------------------------------------- */
  const int *rowPtrA, *colIdxA;

  nrows = fact->rmap->n;
  nnz   = factAij->nz;

  PetscCallCUDA(cudaMalloc((void **)&tri->csrRowPtr, sizeof(int) * (nrows + 1)));
  PetscCallCUDA(cudaMalloc((void **)&tri->csrColIdx, sizeof(int) * nnz));
  PetscCallCUDA(cudaMalloc((void **)&tri->csrVal, sizeof(PetscScalar) * nnz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &rowPtrA, &colIdxA)); /* PETSC_FALSE: ask for the uncompressed row pointers */
  PetscCallCUDA(cudaMemcpyAsync(tri->csrRowPtr, rowPtrA, sizeof(int) * (nrows + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(tri->csrColIdx, colIdxA, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ---------------------------------------------------------------------- */
  /* Matrix descriptors: legacy descriptor for M, generic CSR one for L     */
  /* ---------------------------------------------------------------------- */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&tri->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(tri->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(tri->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* On cusparseDiagType_t (https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t):
    the type says whether the diagonal entries are unity. Diagonal elements are always assumed present;
    with CUSPARSE_DIAG_TYPE_UNIT a routine assumes they equal one and neither reads nor modifies them,
    whatever is stored in memory. L here carries a real diagonal, hence NON_UNIT.
  */
  cusparseFillMode_t fillL = CUSPARSE_FILL_MODE_LOWER;
  cusparseDiagType_t diagL = CUSPARSE_DIAG_TYPE_NON_UNIT;

  PetscCallCUSPARSE(cusparseCreateCsr(&tri->spMatDescr_L, nrows, nrows, nnz, tri->csrRowPtr, tri->csrColIdx, tri->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(tri->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillL, sizeof(fillL)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(tri->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagL, sizeof(diagL)));

  /* ---------------------------------------------------------------------- */
  /* Buffer sizes for csric0 and for SpSV with L and Lt; allocate buffers   */
  /* ---------------------------------------------------------------------- */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&tri->ic0Info_M));
  /* The size query is skipped for an empty matrix (nrows == 0) */
  if (nrows) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(tri->handle, nrows, nnz, tri->matDescr_M, tri->csrVal, tri->csrRowPtr, tri->csrColIdx, tri->ic0Info_M, &tri->factBufferSize_M));

  /* Work vectors backing the dense-vector descriptors handed to SpSV */
  PetscCallCUDA(cudaMalloc((void **)&tri->X, sizeof(PetscScalar) * nrows));
  PetscCallCUDA(cudaMalloc((void **)&tri->Y, sizeof(PetscScalar) * nrows));

  PetscCallCUSPARSE(cusparseCreateDnVec(&tri->dnVecDescr_X, nrows, tri->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&tri->dnVecDescr_Y, nrows, tri->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&tri->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(tri->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_L, &tri->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&tri->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(tri->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, tri->spMatDescr_L, tri->dnVecDescr_X, tri->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tri->spsvDescr_Lt, &tri->spsvBufferSize_Lt));

  /* The factorization buffer is aliased with the larger of the two solve buffers to save device
     memory (see also the comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0()). The shared
     allocation is made first, then the remaining solve buffer gets its own allocation.
   */
  {
    const PetscBool shareWithL = (tri->spsvBufferSize_L > tri->spsvBufferSize_Lt) ? PETSC_TRUE : PETSC_FALSE;
    const size_t    solveBytes = shareWithL ? tri->spsvBufferSize_L : tri->spsvBufferSize_Lt;

    PetscCallCUDA(cudaMalloc((void **)&tri->factBuffer_M, PetscMax(solveBytes, (size_t)tri->factBufferSize_M)));
    if (shareWithL) {
      tri->spsvBuffer_L = tri->factBuffer_M;
      PetscCallCUDA(cudaMalloc((void **)&tri->spsvBuffer_Lt, tri->spsvBufferSize_Lt));
    } else {
      tri->spsvBuffer_Lt = tri->factBuffer_M;
      PetscCallCUDA(cudaMalloc((void **)&tri->spsvBuffer_L, tri->spsvBufferSize_L));
    }
  }

  /* ---------------------------------------------------------------------- */
  /* ic0 analysis on M (its lower triangle has the sparsity pattern of L)   */
  /* ---------------------------------------------------------------------- */
  tri->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (nrows) PetscCallCUSPARSE(cusparseXcsric02_analysis(tri->handle, nrows, nnz, tri->matDescr_M, tri->csrVal, tri->csrRowPtr, tri->csrColIdx, tri->ic0Info_M, tri->policy_M, tri->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              zeroPivotRow;
    cusparseStatus_t pivotStatus;

    /* cusparseXcsric02_zeroPivot() blocks: it calls cudaDeviceSynchronize() so that all previous kernels are done */
    pivotStatus = cusparseXcsric02_zeroPivot(tri->handle, tri->ic0Info_M, &zeroPivotRow);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != pivotStatus, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", zeroPivotRow, zeroPivotRow);
  }

  /* Flop estimate for the numeric phase, computed from A's row pointers */
  {
    const PetscInt *rowStart = ((Mat_SeqAIJ *)A->data)->i;
    PetscLogDouble  estimate = 0.0;

    for (PetscInt row = 0; row < nrows; row++) {
      const PetscInt nzRow = rowStart[row + 1] - rowStart[row];
      PetscInt       nzLeft;

      if (nzRow <= 1) continue; /* nothing to eliminate in rows holding at most one entry */
      /* Entries left of the diagonal are eliminated one at a time; each elimination is assumed to update
         the eliminated entry and everything to its right, one multiply and one add per entry.
         The number of left entries is taken as half of the off-diagonal entries of the row.
      */
      nzLeft = (nzRow - 1) / 2;
      estimate += nzLeft * (2.0 * nzRow - nzLeft + 1);
    }
    tri->numericFactFlops = estimate;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(0);
}
1934da112707SJunchao Zhang #endif
1935da112707SJunchao Zhang 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization entry point for cuSPARSE factor matrices.

  With cuSPARSE >= 11.5, when B has factorizeOnDevice set, info->levels is zero and both isrow and
  iscol are identity orderings, the work is delegated to MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
  In every other case the triangular-factor data of B is reset, the SeqAIJ symbolic ILU is run, and
  MatLUFactorNumeric_SeqAIJCUSPARSE() is installed as the numeric LU routine of B.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  {
    PetscBool rowIsIdentity = PETSC_FALSE, colIsIdentity = PETSC_FALSE;

    /* The orderings are only inspected when device factorization was requested */
    if (triFactors->factorizeOnDevice) {
      PetscCall(ISIdentity(isrow, &rowIsIdentity));
      PetscCall(ISIdentity(iscol, &colIsIdentity));
    }
    if (!info->levels && rowIsIdentity && colIsIdentity) {
      PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
      PetscFunctionReturn(0);
    }
  }
#endif
  /* Fallback: symbolic factorization through the SeqAIJ implementation */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1957da112707SJunchao Zhang 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for cuSPARSE factor matrices.

  Resets the triangular-factor data stored in B->spptr, runs the SeqAIJ symbolic LU, and installs
  MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric LU routine of B.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1967da112707SJunchao Zhang 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization entry point for cuSPARSE factor matrices.

  With cuSPARSE >= 11.5, when B has factorizeOnDevice set, info->levels is zero and perm is the
  identity ordering, the work is delegated to MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(). In every
  other case the triangular-factor data of B is reset, the SeqAIJ symbolic ICC is run, and
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE() is installed as the numeric Cholesky routine of B.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  {
    PetscBool permIsIdentity = PETSC_FALSE;

    /* The ordering is only inspected when device factorization was requested */
    if (triFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &permIsIdentity));
    if (!info->levels && permIsIdentity) {
      PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
      PetscFunctionReturn(0);
    }
  }
#endif
  /* Fallback: symbolic factorization through the SeqAIJ implementation */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1986da112707SJunchao Zhang 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for cuSPARSE factor matrices.

  Resets the triangular-factor data stored in B->spptr, runs the SeqAIJ symbolic Cholesky, and
  installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric Cholesky routine of B.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1996da112707SJunchao Zhang 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

  The matrix argument A is not inspected; *type is always set to MATSOLVERCUSPARSE.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type) {
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2002841d4cb1SJunchao Zhang 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type `MATSEQAIJCUSPARSE` on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  cuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
2015841d4cb1SJunchao Zhang 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the (not yet factored) MATSEQAIJCUSPARSE factor matrix for A.

  Input Parameters:
+ A     - the matrix to be factored; supplies the local size, communicator, block sizes and CPU binding
- ftype - the factorization type; LU, ILU, ILUDT, Cholesky and ICC are accepted, anything else errors with PETSC_ERR_SUP

  Output Parameter:
. B - the new factor matrix

  Notes:
  The option -mat_factor_bind_factorization <host|device> (compared case-insensitively, default
  "device") is read with the factor prefix of B if one is set, otherwise with the prefix of A,
  and is recorded in the factorizeOnDevice field of the triangular-factor data of B.

  The symbolic factorization routines installed are the cuSPARSE ones unless A is bound to the
  CPU, in which case the plain SeqAIJ routines are used.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B) {
  PetscInt  nloc = A->rmap->n;
  PetscBool onDevice, onHost;
  char     *optPrefix;
  char      where[32] = "device"; /* the default */
  Mat       F;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  F = *B;
  PetscCall(MatSetSizes(F, nloc, nloc, nloc, nloc));
  F->factortype = ftype;
  PetscCall(MatSetType(F, MATSEQAIJCUSPARSE));

  /* Where should the factorization itself run? */
  optPrefix = F->factorprefix ? F->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)F), optPrefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, where, sizeof(where), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", where, &onDevice));
  PetscCall(PetscStrcasecmp("host", where, &onHost));
  PetscCheck(onDevice || onHost, PetscObjectComm((PetscObject)F), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", where);
  ((Mat_SeqAIJCUSPARSETriFactors *)F->spptr)->factorizeOnDevice = onDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(F, PETSC_TRUE));

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(F, A, A));
    if (A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&F->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&F->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(F, MAT_SKIP_ALLOCATION, NULL));
  F->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)F, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2067841d4cb1SJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the matrix values from the device to the host array a->a.

  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_GPU. For an unfactored matrix the values
  are read from the device CsrMatrix. For a factored matrix they are read from the device factor
  array fs->csrVal, which exists only when built against cuSPARSE >= 11.5 and only when it has
  been allocated; otherwise PETSC_ERR_SUP is raised. On success the offload mask becomes
  PETSC_OFFLOAD_BOTH.

  Note:
  The version guard below must match the CUSPARSE_VERSION >= 11500 guard that enables the device
  ILU0/ICC0 factorizations in this file; with a higher threshold the factor copy-back branch is
  never compiled and factors produced on the device cannot be brought back to the host.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) {
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 11500
  /* For factored matrices A->spptr holds the triangular-factor data instead */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 11500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
20967e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array of A for read-write access.

  The values are first brought back from the device with MatSeqAIJCUSPARSECopyFromGPU().
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
210367a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - Ends read-write access to the host values array of A.

  The caller's pointer is cleared and the offload mask is set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
211067a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - Returns the host values array of A for read-only access.

  The values are first brought back from the device with MatSeqAIJCUSPARSECopyFromGPU().
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
211767a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - Ends read-only access to the host values array of A.

  Only the caller's pointer is cleared; the offload mask of A is left unchanged.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[]) {
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
212367a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - Returns the host values array of A for write-only access.

  No device-to-host copy is performed before the pointer is handed out.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
212967a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - Ends write-only access to the host values array of A.

  The caller's pointer is cleared and the offload mask is set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[]) {
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
21367e8381f9SStefano Zampini 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - Returns the device CSR arrays of an unfactored matrix.

  Input Parameter:
. A - the matrix; must not be a factored matrix and must be stored in MAT_CUSPARSE_CSR format

  Output Parameters (each may be NULL, in which case it is skipped):
+ i     - row offsets of the device CsrMatrix
. j     - column indices of the device CsrMatrix
. a     - values of the device CsrMatrix
- mtype - set to PETSC_MEMTYPE_CUDA

  Notes:
  The matrix is validated BEFORE MatSeqAIJCUSPARSECopyToGPU() is called, because that routine
  interprets A->spptr as a Mat_SeqAIJCUSPARSE and dereferences it; for a factored matrix A->spptr
  holds triangular-factor data instead.

  Requesting i or j in a build with 64-bit PetscInt fails with PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* For ELL/HYB storage cusp->mat->mat is not a CsrMatrix, so the cast below would be invalid */
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Only available for MAT_CUSPARSE_CSR format");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not support 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
21667ee59b9bSJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of A consistent with the host CSR data.

  Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  - If the nonzero state recorded on the device side matches A->nonzerostate and the storage
    format is MAT_CUSPARSE_CSR, only the values are re-sent and the cached transpose is
    invalidated without being destroyed.
  - Otherwise the device structure is destroyed and rebuilt from the host row offsets, column
    indices and (when present) values, honouring compressed-row storage. ELL/HYB formats are
    built via a temporary CSR matrix for CUDA < 11 and are rejected for CUDA >= 11.

  The offload mask is set to PETSC_OFFLOAD_BOTH unless the host values array a->a was NULL during
  a rebuild. Errors out with PETSC_ERR_GPU if A is bound to the CPU.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) {
  Mat_SeqAIJCUSPARSE           *cusp       = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult       = cusp->mat;
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ *)A->data;
  PetscInt                      nrows      = A->rmap->n, *rowptr, *cprows, ncprows;
  PetscBool                     valuesSent = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  /* Only the UNALLOCATED and CPU offload states require any work */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  if (A->nonzerostate == cusp->nonzerostate && cusp->format == MAT_CUSPARSE_CSR) {
    /* Same nonzero pattern: copy values only */
    CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

    PetscCheck(!aij->nz || aij->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    csr->values->assign(aij->a, aij->a + aij->nz);
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogCpuToGpu((aij->nz) * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  } else {
    PetscInt nnz;

    /* Throw away the old device structure and everything derived from it */
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    cusp->workVector     = NULL;
    cusp->rowoffsets_gpu = NULL;
    try {
      /* Select the host row structure: compressed rows when in use, the plain CSR rows otherwise */
      if (aij->compressedrow.use) {
        nrows  = aij->compressedrow.nrows;
        rowptr = aij->compressedrow.i;
        cprows = aij->compressedrow.rindex;
      } else {
        nrows  = A->rmap->n;
        rowptr = aij->i;
        cprows = NULL;
      }
      PetscCheck(rowptr, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
      if (aij->a) {
        nnz = aij->nz;
      } else {
        /* No host values: take the count from the row offsets and do not mark the matrix as synchronized */
        nnz        = rowptr[nrows];
        valuesSent = PETSC_FALSE;
      }
      PetscCheck(!nnz || aij->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

      /* create cusparse matrix */
      cusp->nrows = nrows;
      mult        = new Mat_SeqAIJCUSPARSEMultStruct;
      PetscCallCUSPARSE(cusparseCreateMatDescr(&mult->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(mult->descr, CUSPARSE_INDEX_BASE_ZERO));
      PetscCallCUSPARSE(cusparseSetMatType(mult->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

      /* device-resident scalar constants; the handle is switched to device pointer mode below */
      PetscCallCUDA(cudaMalloc((void **)&(mult->alpha_one), sizeof(PetscScalar)));
      PetscCallCUDA(cudaMalloc((void **)&(mult->beta_zero), sizeof(PetscScalar)));
      PetscCallCUDA(cudaMalloc((void **)&(mult->beta_one), sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy(mult->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(mult->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMemcpy(mult->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
      PetscCallCUSPARSE(cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE));

      if (cusp->format == MAT_CUSPARSE_CSR) {
        /* plain CSR storage */
        CsrMatrix *csr   = new CsrMatrix;
        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows + 1);
        csr->row_offsets->assign(rowptr, rowptr + nrows + 1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j + nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a + nnz);

        /* assign the pointer */
        mult->mat = csr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        if (csr->num_rows) { /* cusparse errors on empty matrices! */
          /* row offset, col idx types are 32-bit due to THRUSTINTARRAY32 */
          PetscCallCUSPARSE(cusparseCreateCsr(&mult->matDescr, csr->num_rows, csr->num_cols, csr->num_entries, csr->row_offsets->data().get(), csr->column_indices->data().get(), csr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
        }
#endif
      } else if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) {
        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        /* temporary CSR copy on the device, converted to HYB and then released */
        CsrMatrix *csr   = new CsrMatrix;
        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows + 1);
        csr->row_offsets->assign(rowptr, rowptr + nrows + 1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j + nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a + nnz);

        cusparseHybMat_t hyb;
        PetscCallCUSPARSE(cusparseCreateHybMat(&hyb));
        cusparseHybPartition_t partition = cusp->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        PetscCallCUSPARSE(cusparse_csr2hyb(cusp->handle, csr->num_rows, csr->num_cols, mult->descr, csr->values->data().get(), csr->row_offsets->data().get(), csr->column_indices->data().get(), hyb, 0, partition));
        /* assign the pointer */
        mult->mat = hyb;

        /* delete on a null pointer is a no-op, so no explicit null tests are needed */
        delete (THRUSTARRAY *)csr->values;
        delete (THRUSTINTARRAY32 *)csr->column_indices;
        delete (THRUSTINTARRAY32 *)csr->row_offsets;
        delete (CsrMatrix *)csr;
#endif
      }

      /* assign the compressed row indices */
      if (aij->compressedrow.use) {
        cusp->workVector   = new THRUSTARRAY(nrows);
        mult->cprowIndices = new THRUSTINTARRAY(nrows);
        mult->cprowIndices->assign(cprows, cprows + nrows);
        ncprows = nrows;
      } else {
        cusp->workVector   = NULL;
        mult->cprowIndices = NULL;
        ncprows            = 0;
      }
      PetscCall(PetscLogCpuToGpu(((nrows + 1) + (aij->nz)) * sizeof(int) + ncprows * sizeof(PetscInt) + (3 + (aij->nz)) * sizeof(PetscScalar)));

      /* assign the pointer */
      cusp->mat = mult;
    } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
    cusp->nonzerostate = A->nonzerostate;
  }
  if (valuesSent) A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
23139ae82921SPaul Mullowney 
/* Functor on a 2-tuple: adds element 0 into element 1, i.e. get<1>(t) = get<1>(t) + get<0>(t).
   Presumably applied over thrust zip iterators -- confirm at the call sites. */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple pair) {
    thrust::get<1>(pair) = thrust::get<1>(pair) + thrust::get<0>(pair);
  }
};
2320aa372e3fSPaul Mullowney 
/* Functor on a 2-tuple: copies element 0 into element 1, i.e. get<1>(t) = get<0>(t).
   Presumably applied over thrust zip iterators -- confirm at the call sites. */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple pair) {
    thrust::get<1>(pair) = thrust::get<0>(pair);
  }
};
23277e8381f9SStefano Zampini 
/* Functor on a 2-tuple: copies element 1 into element 0, i.e. get<0>(t) = get<1>(t)
   (the opposite direction of VecCUDAEquals). */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple pair) {
    thrust::get<0>(pair) = thrust::get<1>(pair);
  }
};
2334e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - Per-product data attached to matrix-matrix products computed with cuSPARSE.

  It is released by MatDestroy_MatMatCusparse(), which frees every device buffer with cudaFree(),
  destroys the cuSPARSE descriptors and the matrix X, and deletes Bcsr.
*/
struct MatMatCusparse {
  PetscBool      cisdense; /* NOTE(review): presumably records whether C was originally a dense (host) matrix -- confirm */
  PetscScalar   *Bt;       /* device buffer (released with cudaFree()); presumably an explicit transpose of B -- confirm */
  Mat            X;        /* auxiliary matrix owned by this struct */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;    /* flop count associated with the product */
  CsrMatrix     *Bcsr;     /* CSR wrapper owned by this struct */

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr; /* sparse matrix descriptor */
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;   /* dense matrix descriptor */
  cusparseDnMatDescr_t matCDescr;   /* dense matrix descriptor */
  PetscInt             Blda;        /* Record leading dimensions of B and C here to detect changes */
  PetscInt             Clda;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  void *dBuffer4; /* device buffers, only present for CUDA >= 11.4 */
  void *dBuffer5;
#endif
  size_t                mmBufferSize; /* NOTE(review): presumably the size in bytes of mmBuffer -- confirm */
  void                 *mmBuffer;     /* device work buffer */
  void                 *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor */
#endif
};
2359ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Destructor for a MatMatCusparse product-data struct.

  Input Parameter:
. data - pointer to the MatMatCusparse to release

  Frees the device buffers, destroys the cuSPARSE descriptors that were created (CUDA >= 11),
  destroys the auxiliary matrix X, deletes Bcsr and finally frees the struct itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data) {
  MatMatCusparse *mm = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  /* descriptors: only destroy those that were created */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  /* device work buffers */
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mm->dBuffer4) PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5) PetscCallCUDA(cudaFree(mm->dBuffer5));
#endif
  if (mm->mmBuffer) PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2) PetscCallCUDA(cudaFree(mm->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2382ccdfe979SStefano Zampini 
2383ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2384ccdfe979SStefano Zampini 
23859371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
2386ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2387ccdfe979SStefano Zampini   Mat                           A, B;
2388afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2389ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2390ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2391ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2392ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2393ccdfe979SStefano Zampini   const PetscScalar            *barray;
2394ccdfe979SStefano Zampini   PetscScalar                  *carray;
2395ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2396ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2397ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2398ccdfe979SStefano Zampini 
2399ccdfe979SStefano Zampini   PetscFunctionBegin;
2400ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
240128b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2402ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2403ccdfe979SStefano Zampini   A      = product->A;
2404ccdfe979SStefano Zampini   B      = product->B;
24059566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
240628b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2407ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2408ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
240928b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
24109566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2411ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2412ccdfe979SStefano Zampini   switch (product->type) {
2413ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2414ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2415ccdfe979SStefano Zampini     mat = cusp->mat;
2416ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2417ccdfe979SStefano Zampini     m   = A->rmap->n;
2418ccdfe979SStefano Zampini     n   = B->cmap->n;
2419ccdfe979SStefano Zampini     break;
2420ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
24211a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2422e6e9a74fSStefano Zampini       mat = cusp->mat;
2423e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2424e6e9a74fSStefano Zampini     } else {
24259566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2426ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2427ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2428e6e9a74fSStefano Zampini     }
2429ccdfe979SStefano Zampini     m = A->cmap->n;
2430ccdfe979SStefano Zampini     n = B->cmap->n;
2431ccdfe979SStefano Zampini     break;
2432ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2433ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2434ccdfe979SStefano Zampini     mat = cusp->mat;
2435ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2436ccdfe979SStefano Zampini     m   = A->rmap->n;
2437ccdfe979SStefano Zampini     n   = B->rmap->n;
2438ccdfe979SStefano Zampini     break;
24399371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2440ccdfe979SStefano Zampini   }
244128b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2442ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2443ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
24449566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
24459566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
24469566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDAGetArrayRead(B, &barray));
2447afb2bd1cSJunchao Zhang 
24489566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2449c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
24509566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
24519566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2452c8378d12SStefano Zampini   } else {
24539566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
24549566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2455c8378d12SStefano Zampini   }
2456c8378d12SStefano Zampini 
24579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2458afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2459afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2460a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2461afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2462fcdce8c4SStefano Zampini     size_t mmBufferSize;
24639371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
24649371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
24659371c9d4SSatish Balay       mmdata->matBDescr = NULL;
24669371c9d4SSatish Balay     }
2467afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
24689566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2469afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2470afb2bd1cSJunchao Zhang     }
2471c8378d12SStefano Zampini 
24729371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
24739371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
24749371c9d4SSatish Balay       mmdata->matCDescr = NULL;
24759371c9d4SSatish Balay     }
2476afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
24779566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2478afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2479afb2bd1cSJunchao Zhang     }
2480afb2bd1cSJunchao Zhang 
2481afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
24829371c9d4SSatish Balay       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
24839371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
24849371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2485afb2bd1cSJunchao Zhang     }
24869371c9d4SSatish Balay     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
24879371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2488fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
24899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
24909566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2491fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2492fcdce8c4SStefano Zampini     }
2493afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2494afb2bd1cSJunchao Zhang   } else {
2495afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
24969566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
24979566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
24989566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2499afb2bd1cSJunchao Zhang   }
2500afb2bd1cSJunchao Zhang 
2501afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
25029371c9d4SSatish Balay   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
25039371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2504afb2bd1cSJunchao Zhang #else
2505afb2bd1cSJunchao Zhang   PetscInt k;
2506afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2507ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2508ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2509ccdfe979SStefano Zampini     cublasStatus_t cerr;
2510ccdfe979SStefano Zampini 
25119566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
25129371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
25139371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2514ccdfe979SStefano Zampini     blda = B->cmap->n;
2515afb2bd1cSJunchao Zhang     k = B->cmap->n;
2516afb2bd1cSJunchao Zhang   } else {
2517afb2bd1cSJunchao Zhang     k = B->rmap->n;
2518ccdfe979SStefano Zampini   }
2519ccdfe979SStefano Zampini 
2520afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
25219371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
25229371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2523afb2bd1cSJunchao Zhang #endif
25249566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
25259566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
25269566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
2527ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
25289566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
25299566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2530ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
25319566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
25329566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2533ccdfe979SStefano Zampini   } else {
25349566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
2535ccdfe979SStefano Zampini   }
253648a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
253748a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2538ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2539ccdfe979SStefano Zampini }
2540ccdfe979SStefano Zampini 
25419371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) {
2542ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2543ccdfe979SStefano Zampini   Mat                 A, B;
2544ccdfe979SStefano Zampini   PetscInt            m, n;
2545ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2546ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2547ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2548ccdfe979SStefano Zampini 
2549ccdfe979SStefano Zampini   PetscFunctionBegin;
2550ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
255128b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2552ccdfe979SStefano Zampini   A = product->A;
2553ccdfe979SStefano Zampini   B = product->B;
25549566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
255528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2556ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
255708401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2558ccdfe979SStefano Zampini   switch (product->type) {
2559ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2560ccdfe979SStefano Zampini     m = A->rmap->n;
2561ccdfe979SStefano Zampini     n = B->cmap->n;
2562ccdfe979SStefano Zampini     break;
2563ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2564ccdfe979SStefano Zampini     m = A->cmap->n;
2565ccdfe979SStefano Zampini     n = B->cmap->n;
2566ccdfe979SStefano Zampini     break;
2567ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2568ccdfe979SStefano Zampini     m = A->rmap->n;
2569ccdfe979SStefano Zampini     n = B->rmap->n;
2570ccdfe979SStefano Zampini     break;
2571ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2572ccdfe979SStefano Zampini     m = B->cmap->n;
2573ccdfe979SStefano Zampini     n = B->cmap->n;
2574ccdfe979SStefano Zampini     break;
2575ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2576ccdfe979SStefano Zampini     m = B->rmap->n;
2577ccdfe979SStefano Zampini     n = B->rmap->n;
2578ccdfe979SStefano Zampini     break;
25799371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2580ccdfe979SStefano Zampini   }
25819566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2582ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
25839566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
25849566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2585ccdfe979SStefano Zampini 
2586ccdfe979SStefano Zampini   /* product data */
25879566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2588ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2589afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2590afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
259148a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2592afb2bd1cSJunchao Zhang #endif
2593ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2594ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
25959566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
25969566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2597ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
25989566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2599ccdfe979SStefano Zampini     } else {
26009566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2601ccdfe979SStefano Zampini     }
2602ccdfe979SStefano Zampini   }
2603ccdfe979SStefano Zampini   C->product->data    = mmdata;
2604ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2605ccdfe979SStefano Zampini 
2606ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2607ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2608ccdfe979SStefano Zampini }
2609ccdfe979SStefano Zampini 
26109371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
2611ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2612fcdce8c4SStefano Zampini   Mat                           A, B;
2613fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2614fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2615fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2616fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2617fcdce8c4SStefano Zampini   PetscBool                     flg;
2618fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2619fcdce8c4SStefano Zampini   MatProductType                ptype;
2620fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2621fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2622fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2623fcdce8c4SStefano Zampini #endif
2624b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2625ccdfe979SStefano Zampini 
2626ccdfe979SStefano Zampini   PetscFunctionBegin;
2627ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
262828b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
26299566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
263028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2631fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2632fcdce8c4SStefano Zampini   A      = product->A;
2633fcdce8c4SStefano Zampini   B      = product->B;
2634fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2635fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2636fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
263708401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2638fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
263928b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2640fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
264128b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2642fcdce8c4SStefano Zampini     goto finalize;
2643fcdce8c4SStefano Zampini   }
2644fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
26459566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
264628b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
26479566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
264828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
264928b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
265028b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2651fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2652fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2653fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
265408401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
265508401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
265608401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
26579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
26589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2659fcdce8c4SStefano Zampini 
2660fcdce8c4SStefano Zampini   ptype = product->type;
2661b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2662fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
266328b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2664fa046f9fSJunchao Zhang   }
2665b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2666fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
266728b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2668fa046f9fSJunchao Zhang   }
2669fcdce8c4SStefano Zampini   switch (ptype) {
2670fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2671fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2672fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2673fcdce8c4SStefano Zampini     break;
2674fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2675fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2676fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2677fcdce8c4SStefano Zampini     break;
2678fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2679fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2680fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2681fcdce8c4SStefano Zampini     break;
26829371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2683fcdce8c4SStefano Zampini   }
2684fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
268528b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
268628b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
268728b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2688fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2689fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2690fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
269128b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
269228b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
269328b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
26949566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2695fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2696fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
26979566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2698b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
26999371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
27009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2701b4285af6SJunchao Zhang #else
27029371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
27039371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
27049371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
27059371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2706b4285af6SJunchao Zhang #endif
2707fcdce8c4SStefano Zampini #else
27089371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
27099371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
27109371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2711fcdce8c4SStefano Zampini #endif
27129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
27139566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
27149566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2715fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2716fcdce8c4SStefano Zampini finalize:
2717fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
27189566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
27199566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
27209566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2721fcdce8c4SStefano Zampini   c->reallocs = 0;
2722fcdce8c4SStefano Zampini   C->info.mallocs += 0;
2723fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2724fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2725fcdce8c4SStefano Zampini   C->num_ass++;
2726ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2727ccdfe979SStefano Zampini }
2728fcdce8c4SStefano Zampini 
27299371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) {
2730fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2731fcdce8c4SStefano Zampini   Mat                           A, B;
2732fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2733fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2734fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2735fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2736fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
2737fcdce8c4SStefano Zampini   PetscBool                     flg;
2738fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2739fcdce8c4SStefano Zampini   MatProductType                ptype;
2740fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2741fcdce8c4SStefano Zampini   PetscLogDouble                flops;
2742fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
2743fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2744fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2745fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2746fcdce8c4SStefano Zampini #else
2747fcdce8c4SStefano Zampini   int cnz;
2748fcdce8c4SStefano Zampini #endif
2749b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2750fcdce8c4SStefano Zampini 
2751fcdce8c4SStefano Zampini   PetscFunctionBegin;
2752fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
275328b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2754fcdce8c4SStefano Zampini   A = product->A;
2755fcdce8c4SStefano Zampini   B = product->B;
27569566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
275728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
27589566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
275928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2760fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
2761fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
2762fcdce8c4SStefano Zampini   /* product data */
27639566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2764fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2765fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2766fcdce8c4SStefano Zampini 
27679566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
27689566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2769d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2770d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
277108401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
277208401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2773d60bce21SJunchao Zhang 
2774fcdce8c4SStefano Zampini   ptype = product->type;
2775b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2776fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2777fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2778fa046f9fSJunchao Zhang   }
2779b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2780fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2781fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2782fa046f9fSJunchao Zhang   }
2783fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2784fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2785fcdce8c4SStefano Zampini   switch (ptype) {
2786fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2787fcdce8c4SStefano Zampini     m    = A->rmap->n;
2788fcdce8c4SStefano Zampini     n    = B->cmap->n;
2789fcdce8c4SStefano Zampini     k    = A->cmap->n;
2790fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2791fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2792fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2793fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2794fcdce8c4SStefano Zampini     break;
2795fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2796fcdce8c4SStefano Zampini     m = A->cmap->n;
2797fcdce8c4SStefano Zampini     n = B->cmap->n;
2798fcdce8c4SStefano Zampini     k = A->rmap->n;
27999566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2800fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2801fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2802fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2803fcdce8c4SStefano Zampini     break;
2804fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2805fcdce8c4SStefano Zampini     m = A->rmap->n;
2806fcdce8c4SStefano Zampini     n = B->rmap->n;
2807fcdce8c4SStefano Zampini     k = A->cmap->n;
28089566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2809fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2810fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2811fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2812fcdce8c4SStefano Zampini     break;
28139371c9d4SSatish Balay   default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2814fcdce8c4SStefano Zampini   }
2815fcdce8c4SStefano Zampini 
2816fcdce8c4SStefano Zampini   /* create cusparse matrix */
28179566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
28189566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2819fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
2820fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2821fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2822fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2823fcdce8c4SStefano Zampini 
2824fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2825fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2826fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
28279566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
28289566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2829fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2830fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2831fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2832fcdce8c4SStefano Zampini   } else {
2833fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2834fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2835fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2836fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2837fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2838fcdce8c4SStefano Zampini   }
2839fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2840fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
2841fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
2842fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2843fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2844fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
28459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
28469566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
28479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
28489566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
28499566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
28509566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
28519566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28529566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28539566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2854fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2855fcdce8c4SStefano Zampini     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2856fcdce8c4SStefano Zampini     c->nz                = 0;
2857fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2858fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
2859fcdce8c4SStefano Zampini     goto finalizesym;
2860fcdce8c4SStefano Zampini   }
2861fcdce8c4SStefano Zampini 
286228b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
286328b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2864fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2865fcdce8c4SStefano Zampini   if (!biscompressed) {
2866fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
2867fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2868fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2869fcdce8c4SStefano Zampini #endif
2870fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2871fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2872fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
2873fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2874fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2875fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2876fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2877fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2878fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2879fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2880fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
28819566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2882fcdce8c4SStefano Zampini     }
2883fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2884fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
2885fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2886fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
28879371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28889371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2889fcdce8c4SStefano Zampini     }
2890fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2891fcdce8c4SStefano Zampini #endif
2892fcdce8c4SStefano Zampini   }
289328b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
289428b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2895fcdce8c4SStefano Zampini   /* precompute flops count */
2896fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2897fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2898fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2899fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
2900fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
2901fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2902fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2903fcdce8c4SStefano Zampini       }
2904fcdce8c4SStefano Zampini     }
2905fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2906fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2907fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
2908fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2909fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
2910fcdce8c4SStefano Zampini     }
2911fcdce8c4SStefano Zampini   } else { /* TODO */
2912fcdce8c4SStefano Zampini     flops = 0.;
2913fcdce8c4SStefano Zampini   }
2914fcdce8c4SStefano Zampini 
2915fcdce8c4SStefano Zampini   mmdata->flops = flops;
29169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2917b4285af6SJunchao Zhang 
2918fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29199566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
29209371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
29219371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29229566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2923b4285af6SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2924b4285af6SJunchao Zhang   {
2925b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2926b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2927b4285af6SJunchao Zhang   */
2928b4285af6SJunchao Zhang     void  *dBuffer1    = NULL;
2929b4285af6SJunchao Zhang     void  *dBuffer2    = NULL;
2930b4285af6SJunchao Zhang     void  *dBuffer3    = NULL;
2931b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2932b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
2933b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
2934b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
2935b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
2936b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
2937b4285af6SJunchao Zhang 
2938b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2939b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
29409371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
29419371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2943b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
29449371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
29459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2946b4285af6SJunchao Zhang 
2947b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29489371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
29499371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
29519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
29529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
29539371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
29549371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29559566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
29569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
2957b4285af6SJunchao Zhang 
2958b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2959b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
29609566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2961b4285af6SJunchao Zhang     c->nz                = (PetscInt)C_nnz1;
2962b4285af6SJunchao Zhang     /* allocate matrix C */
29639371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29649371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29659371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
29669371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2967b4285af6SJunchao Zhang     /* update matC with the new pointers */
29689371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29699371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2970b4285af6SJunchao Zhang 
2971b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29729371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
29739371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29749566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
29759371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
29769371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29779566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
29789371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29799371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29809566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2981b4285af6SJunchao Zhang   }
2982ae37ee31SJunchao Zhang #else
2983b4285af6SJunchao Zhang   size_t bufSize2;
2984fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
29859371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
29869371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29879566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2988fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
29899371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
29909371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2991fcdce8c4SStefano Zampini   /* ask again for bufferSize bytes of external memory */
29929371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
29939371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2994fcdce8c4SStefano Zampini   /* Neither the CUSPARSE documentation nor the API is clear.
2995fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2996fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2997fcdce8c4SStefano Zampini      it only appears in the workEstimation calls, but it seems to be needed in compute, so probably the address
2998fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
29999566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3000fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
30019371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
30029371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3003fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
30049566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3005fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
30069371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
30079371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3008fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30099566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3010fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30119566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
30129371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
30139371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
30149371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30159371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3016ae37ee31SJunchao Zhang #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3017fcdce8c4SStefano Zampini #else
30189566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
30199371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30209371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
30219371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3022fcdce8c4SStefano Zampini   c->nz = cnz;
3023fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30249566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3025fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30269566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3027fcdce8c4SStefano Zampini 
30289566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3029fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3030fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3031fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
30329371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30339371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30349371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3035fcdce8c4SStefano Zampini #endif
30369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3038fcdce8c4SStefano Zampini finalizesym:
3039fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3040fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3041fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
30429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
30439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3044fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3045fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3046fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3047fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3048fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3049fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3050fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3053fcdce8c4SStefano Zampini   } else {
3054fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3055fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3058fcdce8c4SStefano Zampini   }
3059fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3060fcdce8c4SStefano Zampini     PetscInt r = 0;
3061fcdce8c4SStefano Zampini     c->i[0]    = 0;
3062fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3063fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3064fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3065fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3066fcdce8c4SStefano Zampini     }
3067fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3068fcdce8c4SStefano Zampini   }
30699566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
30709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
30719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3072fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3073fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3074fcdce8c4SStefano Zampini   c->rmax          = 0;
3075fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3076fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3077fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3078fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3079fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3080fcdce8c4SStefano Zampini   }
30819566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
30829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3083fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3084fcdce8c4SStefano Zampini 
3085fcdce8c4SStefano Zampini   C->nonzerostate++;
30869566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
30879566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3088fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3089fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3090fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3091fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3092fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3093abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3094fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3095fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3096fcdce8c4SStefano Zampini   }
3097fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3098fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3099fcdce8c4SStefano Zampini }
3100fcdce8c4SStefano Zampini 
/* Forward declaration (PETSC_INTERN): takes a Mat and returns a PetscErrorCode.
   The definition is not in this part of the file.
   NOTE(review): judging by the name, this is the SeqAIJ-times-SeqDense product setup, presumably used by
   MatProductSetFromOptions_SeqAIJCUSPARSE() below when B is dense -- confirm against the rest of that function. */
3101fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3102fcdce8c4SStefano Zampini 
3103fcdce8c4SStefano Zampini /* handles sparse or dense B */
31049371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) {
3105fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3106fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3107fcdce8c4SStefano Zampini 
3108fcdce8c4SStefano Zampini   PetscFunctionBegin;
3109fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
31109566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
311148a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3112fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3113fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
311448a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3115fcdce8c4SStefano Zampini   }
311665e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
311765e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
311865e4b4d4SStefano Zampini     switch (product->type) {
311965e4b4d4SStefano Zampini     case MATPRODUCT_AB:
312065e4b4d4SStefano Zampini       if (product->api_user) {
3121d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
31229566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3123d0609cedSBarry Smith         PetscOptionsEnd();
312465e4b4d4SStefano Zampini       } else {
3125d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
31269566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3127d0609cedSBarry Smith         PetscOptionsEnd();
312865e4b4d4SStefano Zampini       }
312965e4b4d4SStefano Zampini       break;
313065e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
313165e4b4d4SStefano Zampini       if (product->api_user) {
3132d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
31339566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3134d0609cedSBarry Smith         PetscOptionsEnd();
313565e4b4d4SStefano Zampini       } else {
3136d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
31379566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3138d0609cedSBarry Smith         PetscOptionsEnd();
313965e4b4d4SStefano Zampini       }
314065e4b4d4SStefano Zampini       break;
314165e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
314265e4b4d4SStefano Zampini       if (product->api_user) {
3143d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
31449566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3145d0609cedSBarry Smith         PetscOptionsEnd();
314665e4b4d4SStefano Zampini       } else {
3147d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
31489566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3149d0609cedSBarry Smith         PetscOptionsEnd();
315065e4b4d4SStefano Zampini       }
315165e4b4d4SStefano Zampini       break;
315265e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
315365e4b4d4SStefano Zampini       if (product->api_user) {
3154d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
31559566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3156d0609cedSBarry Smith         PetscOptionsEnd();
315765e4b4d4SStefano Zampini       } else {
3158d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
31599566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3160d0609cedSBarry Smith         PetscOptionsEnd();
316165e4b4d4SStefano Zampini       }
316265e4b4d4SStefano Zampini       break;
316365e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
316465e4b4d4SStefano Zampini       if (product->api_user) {
3165d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
31669566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3167d0609cedSBarry Smith         PetscOptionsEnd();
316865e4b4d4SStefano Zampini       } else {
3169d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
31709566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3171d0609cedSBarry Smith         PetscOptionsEnd();
317265e4b4d4SStefano Zampini       }
317365e4b4d4SStefano Zampini       break;
31749371c9d4SSatish Balay     default: break;
317565e4b4d4SStefano Zampini     }
317665e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
317765e4b4d4SStefano Zampini   }
317865e4b4d4SStefano Zampini   /* dispatch */
3179fcdce8c4SStefano Zampini   if (isdense) {
3180ccdfe979SStefano Zampini     switch (product->type) {
3181ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3182ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3183ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3184ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3185ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3186fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
31879566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3188fcdce8c4SStefano Zampini       } else {
3189fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3190fcdce8c4SStefano Zampini       }
3191fcdce8c4SStefano Zampini       break;
31929371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
31939371c9d4SSatish Balay     default: break;
3194ccdfe979SStefano Zampini     }
3195fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3196fcdce8c4SStefano Zampini     switch (product->type) {
3197fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3198fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
31999371c9d4SSatish Balay     case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; break;
3200fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3201fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
32029371c9d4SSatish Balay     case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break;
32039371c9d4SSatish Balay     default: break;
3204fcdce8c4SStefano Zampini     }
3205fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
32069566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3207fcdce8c4SStefano Zampini   }
3208ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3209ccdfe979SStefano Zampini }
3210ccdfe979SStefano Zampini 
/* yy = A xx: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add and both transpose flags off */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /* addend */ NULL, /* result */ yy, trans, herm));
  PetscFunctionReturn(0);
}
3216e6e9a74fSStefano Zampini 
/* zz = A xx + yy: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with both transpose flags off */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  const PetscBool trans = PETSC_FALSE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /* addend */ yy, /* result */ zz, trans, herm));
  PetscFunctionReturn(0);
}
3222e6e9a74fSStefano Zampini 
/* yy = A^H xx: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add and both trans and herm set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /* addend */ NULL, /* result */ yy, trans, herm));
  PetscFunctionReturn(0);
}
3228e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with both trans and herm set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  const PetscBool trans = PETSC_TRUE, herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /* addend */ yy, /* result */ zz, trans, herm));
  PetscFunctionReturn(0);
}
32349ae82921SPaul Mullowney 
/* yy = A^T xx: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add, trans set and herm off */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /* addend */ NULL, /* result */ yy, trans, herm));
  PetscFunctionReturn(0);
}
3240ca45077fSPaul Mullowney 
/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Expects a 1D launch that supplies at least n threads; threads whose global index is >= n do nothing.
  The update is a plain (non-atomic) read-modify-write, so this presumably relies on idx[] holding
  distinct entries - TODO confirm against the callers.

  The global index is computed in PetscInt so that it cannot wrap before being compared with n when
  PetscInt is 64-bit.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) {
  const PetscInt i = (PetscInt)blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3245a0e72f99SJunchao Zhang 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - Shared implementation of all MatMult variants: z = op(A) x + y.

  Input Parameters:
+ A     - the matrix
. xx    - the vector x
. yy    - the vector y to add, or NULL for a pure product
. trans - if true, op is a transpose
- herm  - if true (requires trans), op is the conjugate transpose ^H instead of ^T

  Output Parameter:
. zz - the result z; may be the same vector as yy

  Notes:
  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

  For a plain transpose with A->form_explicit_transpose set, the explicitly stored transpose is multiplied
  with CUSPARSE_OPERATION_NON_TRANSPOSE; otherwise the original matrix is used with a transposing operation.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) {
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
  cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                     compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscInt nx = 0, ny = 0; /* only given real values (and only used) for MAT_CUSPARSE_CSR */
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
  if (!a->nz) { /* no stored entries: the product is zero, so z is just y (or 0) */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
    else PetscCall(VecCopy_SeqCUDA(yy, zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
    PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
      PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
      PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'matTranspose' (need to fix)");
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_cols;
        ny             = mat->num_rows;
      }
#endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length and is written straight into z, so beta multiplies the current contents of z.
         Those contents equal y only when z and y are the same vector; in every other case (no y, or y != z, where z
         was obtained write-only and may hold stale data) beta must be 0, and y is added after the product below.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
      }
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
        nx             = mat->num_rows;
        ny             = mat->num_cols;
      }
#endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
        PetscCallCUSPARSE(
          cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                     matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
#else
      CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
#endif
    } else {
      if (cusparsestruct->nrows) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
#endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
        if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
        } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz, 0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
        const PetscInt n = (PetscInt)matstruct->cprowIndices->size();
        if (n > 0) { /* a launch with zero blocks is an invalid configuration */
          ScatterAdd<<<(unsigned int)((n + 255) / 256), 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
          PetscCallCUDA(cudaGetLastError()); /* kernel launches do not return a status; surface launch failures here */
        }
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      /* zz holds op(A)*xx (plus the old zz when yy == zz); add yy when it is a separate vector */
      if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
    }
    PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
  } catch (char *ex) { SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
34089ae82921SPaul Mullowney 
/* zz = A^T xx + yy: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with trans set and herm off */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  const PetscBool trans = PETSC_TRUE, herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, /* addend */ yy, /* result */ zz, trans, herm));
  PetscFunctionReturn(0);
}
3414ca45077fSPaul Mullowney 
/*
  MatAssemblyEnd_SeqAIJCUSPARSE - Completes assembly through MatAssemblyEnd_SeqAIJ() and, if that changed
  the nonzero state of A, frees the device matrix pointer held in the CUSPARSE struct.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  const PetscObjectState stateprev = A->nonzerostate; /* snapshot taken before the SeqAIJ assembly runs */
  Mat_SeqAIJCUSPARSE    *gpu       = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
  /* Nothing to release when the nonzero state is unchanged or no device matrix exists */
  if (stateprev == A->nonzerostate || !gpu->deviceMat) PetscFunctionReturn(0);
  PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
  PetscCallCUDA(cudaFree(gpu->deviceMat));
  gpu->deviceMat = NULL;
  PetscFunctionReturn(0);
}
34289ae82921SPaul Mullowney 
34299ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). Setting these parameters accurately can
   increase performance during matrix assembly by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to `PETSC_COMM_SELF`
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format, also called
   compressed row storage, is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
@*/
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) {
  Mat mat;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  mat = *A;
  /* local and global sizes coincide for this sequential type */
  PetscCall(MatSetSizes(mat, m, n, m, n));
  PetscCall(MatSetType(mat, MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(mat, nz, (PetscInt *)nnz));
  PetscFunctionReturn(0);
}
34859ae82921SPaul Mullowney 
/*
  MatDestroy_SeqAIJCUSPARSE - Destroys the GPU-side data hanging off A->spptr, removes the functions
  composed on A, and finally destroys the underlying SeqAIJ matrix.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) {
  /* Names of the composed functions to detach, in removal order */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C",
  };

  PetscFunctionBegin;
  /* spptr is a triangular-factor struct for factored matrices and a Mat_SeqAIJCUSPARSE otherwise */
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  }
  for (size_t k = 0; k < sizeof(composed) / sizeof(composed[0]); k++) PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
35069ae82921SPaul Mullowney 
3507ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
350895639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
/* Duplicates A into *B: first as a SeqAIJ matrix, then converts that copy in place to MATSEQAIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B) {
  PetscFunctionBegin;
  /* Step 1: duplicate with the SeqAIJ implementation */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  /* Step 2: convert the new matrix in place to the CUSPARSE type */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
35159ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y for SeqAIJCUSPARSE matrices.

  Input Parameters:
+ Y   - the matrix that is updated
. a   - the scalar multiplying X
. X   - the matrix that is added to Y
- str - the relation between the nonzero patterns of X and Y

  Notes:
  When X and Y do not share the same axpy implementation, or when the pattern is neither
  SAME_NONZERO_PATTERN nor SUBSET_NONZERO_PATTERN, the work is delegated to MatAXPY_SeqAIJ()
  after the cached transpose of Y has been invalidated.

  A pattern that was not declared SAME_NONZERO_PATTERN is promoted to it when both matrices have
  the same number of nonzeros, neither uses compressed rows, and their row offsets and column
  indices compare equal on the device.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str) {
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ *)X->data;
  Mat_SeqAIJ         *yaij  = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cuspy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  Mat_SeqAIJCUSPARSE *cuspx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  CsrMatrix          *ycsr, *xcsr;
  const PetscScalar  *xv;
  PetscScalar        *yv;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices use different axpy implementations: use the host kernel */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cuspy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cuspx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix *)cuspy->mat->mat;
  xcsr = (CsrMatrix *)cuspx->mat->mat;

  /* see if we can turn this into a cublas axpy */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool sameRows = thrust::equal(thrust::device, ycsr->row_offsets->begin(), ycsr->row_offsets->end(), xcsr->row_offsets->begin());
    /* the column indices are only compared when the row offsets already agree */
    const bool sameCols = sameRows && thrust::equal(thrust::device, ycsr->column_indices->begin(), ycsr->column_indices->end(), xcsr->column_indices->begin());

    if (sameCols) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry by entry, so a dense axpy suffices */
    cublasHandle_t blasHandle;
    PetscBLASInt   stride = 1, nnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xv));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yv));
    PetscCall(PetscCUBLASGetHandle(&blasHandle));
    PetscCall(PetscBLASIntCast(xaij->nz, &nnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(blasHandle, nnz, &a, xv, stride, yv, stride));
    PetscCall(PetscLogGpuFlops(2.0 * nnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xv));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yv));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SUBSET_NONZERO_PATTERN) {
    /* pattern of X contained in the pattern of Y: sparse add written back into the storage of Y */
    PetscScalar unit = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t workSize;
    void  *work;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &xv));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &yv));

    auto xRowOff = xcsr->row_offsets->data().get();
    auto xColIdx = xcsr->column_indices->data().get();
    auto yRowOff = ycsr->row_offsets->data().get();
    auto yColIdx = ycsr->column_indices->data().get();

    /* the scalars a and unit live on the host for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cuspy->handle, Y->rmap->n, Y->cmap->n, &a, cuspx->mat->descr, xaij->nz, xv, xRowOff, xColIdx, &unit, cuspy->mat->descr, yaij->nz, yv, yRowOff, yColIdx, cuspy->mat->descr, yv, yRowOff, yColIdx, &workSize));
    PetscCallCUDA(cudaMalloc(&work, workSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspy->handle, Y->rmap->n, Y->cmap->n, &a, cuspx->mat->descr, xaij->nz, xv, xRowOff, xColIdx, &unit, cuspy->mat->descr, yaij->nz, yv, yRowOff, yColIdx, cuspy->mat->descr, yv, yRowOff, yColIdx, work));
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(work));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspy->handle, Y->rmap->n, Y->cmap->n, &a, cuspx->mat->descr, xaij->nz, xv, xRowOff, xColIdx, &unit, cuspy->mat->descr, yaij->nz, yv, yRowOff, yColIdx, cuspy->mat->descr, yv, yRowOff, yColIdx));
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &xv));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &yv));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* any other pattern relation is handled by the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
360095639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by the scalar a, using cuBLAS.

  Input Parameters:
+ Y - the matrix whose values are scaled in place
- a - the scaling factor

  Notes:
  The cached diagonal information of Y is invalidated afterwards.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a) {
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  cublasHandle_t blasHandle;
  PetscScalar   *vals;
  PetscBLASInt   stride = 1, nnz = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals));
  PetscCall(PetscCUBLASGetHandle(&blasHandle));
  PetscCall(PetscBLASIntCast(aij->nz, &nnz));

  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blasHandle, nnz, &a, vals, stride));
  PetscCall(PetscLogGpuFlops(nnz));
  PetscCall(PetscLogGpuTimeEnd());

  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
361933c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Sets all stored values of A to zero.

  Notes:
  For an unfactored matrix the device values of the matrix and of its cached transpose are zeroed
  when they exist. The host values are always zeroed. The offload mask becomes PETSC_OFFLOAD_BOTH
  when the device values of the matrix itself were zeroed, and PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) {
  Mat_SeqAIJ *aij       = (Mat_SeqAIJ *)A->data;
  PetscBool   gpuZeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix *)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
        gpuZeroed = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)cusp->matTranspose->mat;

      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* a->i[number of rows] is the total count of stored entries */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpuZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
36453fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches the operation tables of A between the host (SeqAIJ) and
  the device (SeqAIJCUSPARSE) implementations.

  Input Parameters:
+ A   - the matrix
- flg - PETSC_TRUE to use the host implementations, PETSC_FALSE to use the device ones

  Notes:
  Factored matrices only record the flag. When binding to the CPU the values are first copied back
  from the GPU, the SeqAIJ array-access hooks are cleared and the composed device functions are
  removed. Inodes are used only when bound to the CPU and inode information is present.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg) {
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }

  if (!flg) {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;

    aij->ops->getarray          = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray      = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread      = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread  = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite     = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype  = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* composed functions that are removed while the matrix is bound to the CPU */
    const char *const composed[] = {"MatSeqAIJCopySubArray_C", "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", "MatProductSetFromOptions_seqaijcusparse_seqdense_C", "MatSetPreallocationCOO_C", "MatSetValuesCOO_C", "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C"};

    /* make sure the host copy holds the current values before host kernels take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    PetscCall(PetscMemzero(aij->ops, sizeof(Mat_SeqAIJOps)));
    for (size_t k = 0; k < sizeof(composed) / sizeof(composed[0]); k++) { PetscCall(PetscObjectComposeFunction((PetscObject)A, composed[k], NULL)); }
  }

  A->boundtocpu  = flg;
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3708a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a SeqAIJCUSPARSE matrix.

  Input Parameters:
+ A     - the SeqAIJ matrix
. mtype - the requested type (not used by this routine)
- reuse - MAT_INITIAL_MATRIX duplicates A into newmat, MAT_REUSE_MATRIX copies A into the existing
          newmat, any other value operates on *newmat as it is passed in

  Output Parameter:
. newmat - the converted matrix

  Notes:
  The default vector type becomes VECCUDA. Unless reusing a matrix, the GPU-side data structure
  is created when missing: a Mat_SeqAIJCUSPARSE for unfactored matrices, a
  Mat_SeqAIJCUSPARSETriFactors otherwise, each with its own cuSPARSE handle.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat C;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
    break;
  default: /* *newmat is used as passed in */
    break;
  }
  C = *newmat;

  PetscCall(PetscFree(C->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &C->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !C->spptr) {
    if (C->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle, PetscDefaultCudaStream));
      C->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle, PetscDefaultCudaStream));
      cusp->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      cusp->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      C->spptr = cusp;
    }
    C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* type-specific operations that do not depend on the CPU/GPU binding */
  C->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  C->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;

  /* installs the device implementations of the remaining operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(C, PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)C, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)C, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)C, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
37679ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE: builds B as a SeqAIJ matrix and
  then converts it in place to the CUSPARSE type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) {
  Mat self = B;

  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(self));
  /* in-place conversion: the input and output matrix are the same object */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(self, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &self));
  PetscFunctionReturn(0);
}
377402fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
37927f756511SDominic Meiser 
3793bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
37940f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - Registers the factorization routines provided by this file:
  the banded LU solver for MATSEQAIJ, and the LU, Cholesky, ILU and ICC solvers for
  MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) {
  const MatFactorType cusparseFactors[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  /* every factor type of MATSOLVERCUSPARSE is served by the same factory routine */
  for (size_t k = 0; k < sizeof(cusparseFactors) / sizeof(cusparseFactors[0]); k++) { PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, cusparseFactors[k], MatGetFactor_seqaijcusparse_cusparse)); }
  PetscFunctionReturn(0);
}
380529b38603SBarry Smith 
/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - Releases the data kept for COO assembly.

  Input Parameter:
. mat - the matrix; nothing is done when it has no GPU-side data structure

  Notes:
  The permutation arrays are deleted and, when the extended COO path was in use, the device maps
  jmap_d and perm_d are freed. Every released pointer is reset to NULL so that a later cleanup
  that tests these fields before freeing them does not free the same device memory a second time.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    cusp->jmap_d = NULL; /* do not leave a dangling device pointer behind */
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3822cbc6b225SStefano Zampini 
/*
  MatSeqAIJCUSPARSE_Destroy - Frees a Mat_SeqAIJCUSPARSE and everything it owns.

  Input Parameter:
. cusparsestruct - address of the structure pointer; nothing is done when it points to NULL

  Notes:
  Destroys the multiplication structures of the matrix and of its transpose, the work arrays,
  the cuSPARSE handle and the device COO maps, then frees the structure itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) {
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat, cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
  if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));
  PetscCall(PetscFree(*cusparsestruct));
  PetscFunctionReturn(0);
}
38407f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its values, column indices and row
  offsets, and resets the caller's pointer.

  Input Parameter:
. mat - address of the CsrMatrix pointer; nothing is done when it points to NULL
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) {
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
38527f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for triangular factors: frees a
  Mat_SeqAIJCUSPARSETriFactorStruct and the resources attached to it.

  Input Parameter:
. trifactor - address of the structure pointer; nothing is done when it points to NULL

  Notes:
  Releases the matrix descriptor, the solve info, the CSR storage, the solve buffer, the pinned
  host buffer AA_h and, for CUDA 11.0 and later, the csr2csc buffer.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) {
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer) PetscCallCUDA(cudaFree(tf->solveBuffer));
  if (tf->AA_h) PetscCallCUDA(cudaFreeHost(tf->AA_h));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
#endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(0);
}
38687f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload for multiplication structures: frees a
  Mat_SeqAIJCUSPARSEMultStruct and the resources attached to it.

  Input Parameters:
+ matstruct - address of the structure pointer; nothing is done when it points to NULL
- format    - storage format of the matrix held by the structure

  Notes:
  For MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB the stored matrix is a cusparseHybMat_t; these formats
  raise PETSC_ERR_SUP with CUDA 11.0 and later. For every other format the stored matrix is a
  CsrMatrix. On success *matstruct is set to NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format) {
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        /* propagate a failure instead of silently dropping the error code */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* only the SpMV slots that were initialized own a buffer and vector descriptors */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
39097f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases everything held by a Mat_SeqAIJCUSPARSETriFactors
  while keeping the structure itself (and its cuSPARSE handle) alive.

  Input Parameter:
. trifactors - address of the structure pointer; nothing is done when it points to NULL

  Notes:
  Every released device buffer and cuSPARSE descriptor is reset to NULL after it is freed. This
  routine is invoked again by MatSeqAIJCUSPARSETriFactors_Destroy(), so stale values left in the
  structure would otherwise be handed to cudaFree()/cusparseDestroy*() a second time.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) {
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    fs->a_band_d = NULL;
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->i_band_d      = NULL;
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    fs->csrRowPtr = NULL;
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    fs->csrColIdx = NULL;
    PetscCallCUDA(cudaFree(fs->csrVal));
    fs->csrVal = NULL;
    PetscCallCUDA(cudaFree(fs->X));
    fs->X = NULL;
    PetscCallCUDA(cudaFree(fs->Y));
    fs->Y = NULL;
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    fs->factBuffer_M = NULL; /* its memory is released below through spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    fs->spsvBuffer_L = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    fs->spsvBuffer_U = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    fs->spsvBuffer_Lt = NULL;
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->spsvBuffer_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    fs->matDescr_M = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    fs->spMatDescr_L = NULL;
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    fs->spMatDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    fs->spsvDescr_L = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    fs->spsvDescr_Lt = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    fs->spsvDescr_U = NULL;
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    fs->spsvDescr_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    fs->dnVecDescr_X = NULL;
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    fs->dnVecDescr_Y = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    fs->ilu0Info_M = NULL;
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->ic0Info_M = NULL;

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
3957ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - Releases a triangular-factor container.

  The container is first reset with MatSeqAIJCUSPARSETriFactors_Reset(), its cuSPARSE
  handle is destroyed when one is set, and finally the container is freed with PetscFree().
  Nothing is done when *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors) {
  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  {
    cusparseHandle_t handle = (*trifactors)->handle;

    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
  }
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(0);
}
39697e8381f9SStefano Zampini 
/* Orders (row, col) index pairs lexicographically: by row first, then by column. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    /* The free function thrust::get<N>() is used rather than the member t.get<N>(): the member is
       not provided by Thrust releases in which thrust::tuple is an alias of cuda::std::tuple */
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
39777e8381f9SStefano Zampini 
/* Returns true when two (row, col) index pairs have the same row and the same column. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) {
    /* The free function thrust::get<N>() is used rather than the member t.get<N>(): the member is
       not provided by Thrust releases in which thrust::tuple is an alias of cuda::std::tuple */
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
39847e8381f9SStefano Zampini 
/* Binary functor yielding 0 when its two arguments are equal and 1 when they differ. */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) {
    if (t1 != t2) return 1;
    return 0;
  }
};
39887e8381f9SStefano Zampini 
/* Binary functor yielding the logical OR of its arguments: 1 if either one is nonzero, 0 otherwise. */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) {
    const bool any_set = (t1 != 0) || (t2 != 0);

    return any_set ? 1 : 0;
  }
};
39927e8381f9SStefano Zampini 
39937e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - Fills the device CSR value array of A from COO values.

  Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

  Input Parameters:
+ A     - the matrix; A->spptr must hold a Mat_SeqAIJCUSPARSE with a CSR mult structure
. v     - the COO values, in host or device memory (decided with isCudaMem()), or NULL
- imode - ADD_VALUES accumulates into the existing values; any other mode overwrites them

  Notes:
  If cusp->cooPerm is not set, the routine only calls MatAssemblyBegin()/MatAssemblyEnd() and returns.

  If v is NULL, the values are zeroed when imode is INSERT_VALUES and left untouched otherwise.

  Host values are staged in a temporary device array. The temporaries are scoped objects so that
  they are released on every exit path, including early returns triggered by PetscCall().

  On exit the offload mask is PETSC_OFFLOAD_GPU and the matrix is flagged as assembled.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJCUSPARSE                   *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a    = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                           cooPerm_v; /* device staging copy of v[]; stays empty when v[] is already on the device */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    /* no values given: inserting means zeroing the matrix, adding is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
  } else {
    n = cusp->cooPerm->size();
    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      cooPerm_v.assign(v, v + n); /* host -> device copy */
      d_v = cooPerm_v.data();
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them first */
        THRUSTARRAY cooPerm_w(matrix->values->size());
        auto        vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());

        /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
          cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
          cooPerm_a is ordered. The i-th permuted value belongs to the cooPerm_a[i]-th unique nonzero.
        */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w.begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
        thrust::transform(cooPerm_w.begin(), cooPerm_w.end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      } else {
        /* all nonzeros in d_v[] are unique entries */
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
        thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
        thrust::for_each(zibit, zieit, VecCUDAEquals());
      }
    }
    PetscCall(PetscLogGpuTimeEnd());
  }
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs         = 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
40707e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - Flags the cached transpose of A as out of date.

  Input Parameters:
+ A       - a MATSEQAIJCUSPARSE matrix
- destroy - when true, the transpose mult structure and the csr2csc_i array are freed as well

  Note:
  Nothing is changed when A->spptr is NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4085a49f1ed0SStefano Zampini 
40867e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - Builds the CSR structure of A from COO indices on the device.

  'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices.

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices, in host or device memory (decided with PetscGetMemType())
- coo_j - column indices, in host or device memory (decided independently of coo_i)

  Notes:
  Host index arrays are copied into temporary device buffers. Device index arrays are used
  directly by the sort and unique steps, so their contents are modified.

  The routine stores in cusp->cooPerm the permutation that sorts the entries by (row, col), and in
  cusp->cooPerm_a, only when repeated (row, col) pairs exist, the index of the unique nonzero each
  sorted entry belongs to. The host arrays a->i, a->j, a->a are reallocated, the values are zeroed,
  and the matrix is copied to the GPU.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* a different number of entries invalidates the previous permutation arrays */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted row indices */
    THRUSTINTARRAY w(d_j, d_j + n);                                       /* copy the sorted column indices */

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
      d_j     = [2,2,3,5,6,x]
      nekey points at the 'x' position, i.e. one past the last unique (i,j) pair; ekey is one past the 'x'
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                         /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    /* thrust::get<0>() is used rather than the tuple member get<0>(): the member is not provided by
       Thrust releases in which thrust::tuple is an alias of cuda::std::tuple */
    thrust::upper_bound(d_i, thrust::get<0>(nekey.get_iterator_tuple()), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,         /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                                     /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    /* after unique(), the first a->nz entries of d_j[] are the column indices of the unique nonzeros */
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr); /* count the rows that hold at least one nonzero */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4221ed502f03SStefano Zampini 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation entry point for MATSEQAIJCUSPARSE.

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices
- coo_j - column indices

  Notes:
  Any previous COO preallocation (host and device side) is reset first.

  The host arrays are scanned for negative indices only when coo_i is non-NULL and reported as host
  memory by PetscGetMemType(). If a negative index is found, the preallocation is done by
  MatSetPreallocationCOO_SeqAIJ(), the matrix is copied to the GPU, seq->jmap and seq->perm are
  mirrored into dev->jmap_d and dev->perm_d, and dev->use_extended_coo is set. In every other case
  the work is delegated to MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  PetscBool    has_negative = PETSC_FALSE;
  PetscMemType mtype        = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* stop at the first negative row or column index */
      for (PetscCount k = 0; k < coo_n && !has_negative; k++) has_negative = (coo_i[k] < 0 || coo_j[k] < 0) ? PETSC_TRUE : PETSC_FALSE;
    }
  }

  if (!has_negative) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
    PetscFunctionReturn(0);
  }

  /* extended path: build the COO maps on the host, then mirror them on the device */
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
  {
    Mat_SeqAIJ         *seq        = static_cast<Mat_SeqAIJ *>(mat->data);
    Mat_SeqAIJCUSPARSE *dev        = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    const size_t        jmap_bytes = (seq->nz + 1) * sizeof(PetscCount);
    const size_t        perm_bytes = seq->Atot * sizeof(PetscCount);

    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, jmap_bytes));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, jmap_bytes, cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, perm_bytes));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, perm_bytes, cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4259219fbbafSJunchao Zhang 
/*
  MatAddCOOValues - CUDA kernel that reduces COO values into the CSR value array.

  For every nonzero i in [0, nnz), the values kv[perm[k]] with k in [jmap[i], jmap[i+1]) are summed;
  the sum replaces a[i] when imode is INSERT_VALUES and is added to a[i] otherwise.

  Each thread starts at its global index and advances by the total number of launched threads, so
  any 1D launch configuration covers all nnz entries. jmap[] must hold nnz + 1 entries.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[]) {
  /* Widen to PetscCount before multiplying: blockIdx.x, blockDim.x and gridDim.x are unsigned 32-bit
     values, so their product could otherwise wrap around (possibly to a zero stride) on very large launches */
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;
  PetscCount       i         = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x;

  for (; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4269219fbbafSJunchao Zhang 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - Sets the values of a MATSEQAIJCUSPARSE matrix from COO values.

  Input Parameters:
+ A     - the matrix
. v     - the COO values, in host or device memory
- imode - INSERT_VALUES overwrites the matrix values; any other mode adds to them

  Notes:
  When dev->use_extended_coo is set, the values are reduced on the device by the MatAddCOOValues
  kernel using dev->jmap_d and dev->perm_d. A host v[] (as reported by PetscGetMemType()) is first
  copied into a temporary device buffer of seq->coo_n scalars, which is freed before returning.

  Otherwise the work is delegated to MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* the kernel needs device memory: stage a host v[] in a temporary device buffer */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
      /* account for the transfer, as MatSetValuesCOO_SeqAIJCUSPARSE_Basic() does for its own staging copy */
      PetscCall(PetscLogCpuToGpu(seq->coo_n * sizeof(PetscScalar)));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) { /* a launch with zero blocks is invalid, so skip it for an empty matrix */
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}
4303219fbbafSJunchao Zhang 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers, may be NULL if not needed
-   j - the CSR column indices, may be NULL if not needed

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      When compressed is false and the matrix uses compressed rows, a full row offset array is
      created on the device from the host row pointers the first time it is requested

      Nothing is done when both i and j are NULL

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i && !j) PetscFunctionReturn(0); /* nothing requested */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read the implementation structs only after A has been validated */
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  a    = (Mat_SeqAIJ *)A->data;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        /* the device array holds int entries (it is returned as const int*), so log that size */
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(int)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
43505f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

   Not collective

    Input Parameters:
+   A - the matrix
.   compressed - `PETSC_TRUE` or `PETSC_FALSE`, the value passed to `MatSeqAIJCUSPARSEGetIJ()`; not used by this routine
.   i - the CSR row pointers, may be NULL
-   j - the CSR column indices, may be NULL

    Level: developer

    Note:
      On return, every non-NULL argument among i and j points to a NULL pointer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j) {
  const int **handles[2] = {i, j};

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  for (const int **handle : handles) {
    if (handle) *handle = NULL;
  }
  PetscFunctionReturn(0);
}
43765f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

   Not Collective

   Input Parameter:
.   A - a `MATSEQAIJCUSPARSE` matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes:
   May trigger host-device copies if up-to-date matrix data is on host

   Only the CSR storage format is supported; an error is raised for the ELL and HYB formats

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a) {
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* read only after A has been validated */
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4411ed502f03SStefano Zampini 
44125b7e41feSStefano Zampini /*@C
4413*11a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
44145b7e41feSStefano Zampini 
44155b7e41feSStefano Zampini    Not Collective
44165b7e41feSStefano Zampini 
44175b7e41feSStefano Zampini    Input Parameter:
4418*11a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44195b7e41feSStefano Zampini 
44205b7e41feSStefano Zampini    Output Parameter:
44215b7e41feSStefano Zampini .   a - pointer to the device data
44225b7e41feSStefano Zampini 
44235b7e41feSStefano Zampini    Level: developer
44245b7e41feSStefano Zampini 
4425db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
44265b7e41feSStefano Zampini @*/
44279371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a) {
4428ed502f03SStefano Zampini   PetscFunctionBegin;
4429ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4430ed502f03SStefano Zampini   PetscValidPointer(a, 2);
4431ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4432ed502f03SStefano Zampini   *a = NULL;
4433ed502f03SStefano Zampini   PetscFunctionReturn(0);
4434ed502f03SStefano Zampini }
4435ed502f03SStefano Zampini 
44365b7e41feSStefano Zampini /*@C
4437*11a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
44385b7e41feSStefano Zampini 
44395b7e41feSStefano Zampini    Not Collective
44405b7e41feSStefano Zampini 
44415b7e41feSStefano Zampini    Input Parameter:
4442*11a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44435b7e41feSStefano Zampini 
44445b7e41feSStefano Zampini    Output Parameter:
44455b7e41feSStefano Zampini .   a - pointer to the device data
44465b7e41feSStefano Zampini 
44475b7e41feSStefano Zampini    Level: developer
44485b7e41feSStefano Zampini 
4449*11a5261eSBarry Smith    Note:
4450*11a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
44515b7e41feSStefano Zampini 
4452db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
44535b7e41feSStefano Zampini @*/
44549371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a) {
4455039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4456039c6fbaSStefano Zampini   CsrMatrix          *csr;
4457039c6fbaSStefano Zampini 
4458039c6fbaSStefano Zampini   PetscFunctionBegin;
4459039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4460039c6fbaSStefano Zampini   PetscValidPointer(a, 2);
4461039c6fbaSStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4462aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
44639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
446428b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4465039c6fbaSStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
446628b400f6SJacob Faibussowitsch   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4467039c6fbaSStefano Zampini   *a             = csr->values->data().get();
4468039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
44699566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4470039c6fbaSStefano Zampini   PetscFunctionReturn(0);
4471039c6fbaSStefano Zampini }
44725b7e41feSStefano Zampini /*@C
4473*11a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4474039c6fbaSStefano Zampini 
44755b7e41feSStefano Zampini    Not Collective
44765b7e41feSStefano Zampini 
44775b7e41feSStefano Zampini    Input Parameter:
4478*11a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44795b7e41feSStefano Zampini 
44805b7e41feSStefano Zampini    Output Parameter:
44815b7e41feSStefano Zampini .   a - pointer to the device data
44825b7e41feSStefano Zampini 
44835b7e41feSStefano Zampini    Level: developer
44845b7e41feSStefano Zampini 
4485db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`
44865b7e41feSStefano Zampini @*/
44879371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a) {
4488039c6fbaSStefano Zampini   PetscFunctionBegin;
4489039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4490039c6fbaSStefano Zampini   PetscValidPointer(a, 2);
4491039c6fbaSStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
44929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
44939566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4494039c6fbaSStefano Zampini   *a = NULL;
4495039c6fbaSStefano Zampini   PetscFunctionReturn(0);
4496039c6fbaSStefano Zampini }
4497039c6fbaSStefano Zampini 
44985b7e41feSStefano Zampini /*@C
4499*11a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45005b7e41feSStefano Zampini 
45015b7e41feSStefano Zampini    Not Collective
45025b7e41feSStefano Zampini 
45035b7e41feSStefano Zampini    Input Parameter:
4504*11a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45055b7e41feSStefano Zampini 
45065b7e41feSStefano Zampini    Output Parameter:
45075b7e41feSStefano Zampini .   a - pointer to the device data
45085b7e41feSStefano Zampini 
45095b7e41feSStefano Zampini    Level: developer
45105b7e41feSStefano Zampini 
4511*11a5261eSBarry Smith    Note:
4512*11a5261eSBarry Smith    Does not trigger host-device copies and flags data validity on the GPU
45135b7e41feSStefano Zampini 
4514db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
45155b7e41feSStefano Zampini @*/
45169371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a) {
4517ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
4518ed502f03SStefano Zampini   CsrMatrix          *csr;
4519ed502f03SStefano Zampini 
4520ed502f03SStefano Zampini   PetscFunctionBegin;
4521ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4522ed502f03SStefano Zampini   PetscValidPointer(a, 2);
4523ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4524aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
452528b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4526ed502f03SStefano Zampini   csr = (CsrMatrix *)cusp->mat->mat;
452728b400f6SJacob Faibussowitsch   PetscCheck(csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
4528ed502f03SStefano Zampini   *a             = csr->values->data().get();
4529039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
45309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
4531ed502f03SStefano Zampini   PetscFunctionReturn(0);
4532ed502f03SStefano Zampini }
4533ed502f03SStefano Zampini 
45345b7e41feSStefano Zampini /*@C
4535*11a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
45365b7e41feSStefano Zampini 
45375b7e41feSStefano Zampini    Not Collective
45385b7e41feSStefano Zampini 
45395b7e41feSStefano Zampini    Input Parameter:
4540*11a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45415b7e41feSStefano Zampini 
45425b7e41feSStefano Zampini    Output Parameter:
45435b7e41feSStefano Zampini .   a - pointer to the device data
45445b7e41feSStefano Zampini 
45455b7e41feSStefano Zampini    Level: developer
45465b7e41feSStefano Zampini 
4547db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
45485b7e41feSStefano Zampini @*/
45499371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a) {
4550ed502f03SStefano Zampini   PetscFunctionBegin;
4551ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4552ed502f03SStefano Zampini   PetscValidPointer(a, 2);
4553ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
45549566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
45559566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4556ed502f03SStefano Zampini   *a = NULL;
4557ed502f03SStefano Zampini   PetscFunctionReturn(0);
4558ed502f03SStefano Zampini }
4559ed502f03SStefano Zampini 
45609371c9d4SSatish Balay struct IJCompare4 {
45619371c9d4SSatish Balay   __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) {
4562ed502f03SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
4563ed502f03SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4564ed502f03SStefano Zampini     return false;
4565ed502f03SStefano Zampini   }
4566ed502f03SStefano Zampini };
4567ed502f03SStefano Zampini 
45689371c9d4SSatish Balay struct Shift {
4569ed502f03SStefano Zampini   int _shift;
4570ed502f03SStefano Zampini 
4571ed502f03SStefano Zampini   Shift(int shift) : _shift(shift) { }
45729371c9d4SSatish Balay   __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
4573ed502f03SStefano Zampini };
4574ed502f03SStefano Zampini 
4575ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
45769371c9d4SSatish Balay PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) {
4577ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4578ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4579ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4580ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4581ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4582ed502f03SStefano Zampini   cusparseStatus_t              stat;
4583ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4584ed502f03SStefano Zampini 
4585ed502f03SStefano Zampini   PetscFunctionBegin;
4586ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4587ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4588ed502f03SStefano Zampini   PetscValidPointer(C, 4);
4589ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4590ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
45915f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
459208401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4593aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4594aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4595ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4596ed502f03SStefano Zampini     m = A->rmap->n;
4597ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
45989566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
45999566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
46009566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4601ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4602ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4603ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4604ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4605ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4606ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4607ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4608ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4609ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4610ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4611ed502f03SStefano Zampini     Ccusp->nrows            = m;
4612ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4613ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4614ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4615ed502f03SStefano Zampini     Ccsr->num_cols          = n;
46169566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
46179566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
46189566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
46199566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
46209566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
46219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
46229566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46239566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46249566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46259566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46269566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
462728b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
462828b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4629ed502f03SStefano Zampini 
4630ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4631ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4632ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4633ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4634ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4635ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4636ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4637ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4638ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
4639ed502f03SStefano Zampini     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4640ed502f03SStefano Zampini     if (c->nz) {
46412ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
46422ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
46432ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
46442ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
46452ed87e7eSStefano Zampini 
4646ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4647ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4648ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4649ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
46509566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4651ed502f03SStefano Zampini         }
46522ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
46532ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4654ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4655ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4656ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4657ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
46589566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4659ed502f03SStefano Zampini         }
46602ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
46612ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
46629566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
46639371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46649371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46659371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46669371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46672ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
46682ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
46692ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
46708909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4671ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4672ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
46738909a122SStefano Zampini #else
46748909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
46758909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
46768909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
46778909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
46788909a122SStefano Zampini #endif
46792ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
46802ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
46812ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
46822ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
46832ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
46842ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4685ed502f03SStefano Zampini       auto p1    = Ccusp->cooPerm->begin();
4686ed502f03SStefano Zampini       auto p2    = Ccusp->cooPerm->begin();
4687ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4688792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
46898909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
46908909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
46918909a122SStefano Zampini #endif
46922ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
46932ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
46942ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4695792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
46962ed87e7eSStefano Zampini #else
46972ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4698792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4699792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
47002ed87e7eSStefano Zampini #endif
47019371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47029371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47039566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
47042ed87e7eSStefano Zampini       delete wPerm;
47052ed87e7eSStefano Zampini       delete Acoo;
47062ed87e7eSStefano Zampini       delete Bcoo;
47072ed87e7eSStefano Zampini       delete Ccoo;
4708ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47099371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47109371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4711ed502f03SStefano Zampini #endif
47121a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
47139566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
47149566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4715ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4716ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4717ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4718ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4719ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4720ed502f03SStefano Zampini 
47211a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
47221a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4723a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4724ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4725ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4726ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4727ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4728ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4729ed502f03SStefano Zampini 
4730ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4731ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4732ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4733ed502f03SStefano Zampini 
47349566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4735ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4736ed502f03SStefano Zampini         if (AT) {
4737ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4738ed502f03SStefano Zampini           thrust::advance(rT, -1);
4739ed502f03SStefano Zampini         }
4740ed502f03SStefano Zampini         if (BT) {
4741ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4742ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4743ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4744ed502f03SStefano Zampini         }
4745ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4746ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4747ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4748ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4749ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4750ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
47519566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4752ed502f03SStefano Zampini 
47539566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
47549566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
47559566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
47569566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
47579566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
47589566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
47599566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47609566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47619566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4762ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47639371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47649371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4765ed502f03SStefano Zampini #endif
4766ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4767ed502f03SStefano Zampini       }
4768ed502f03SStefano Zampini     }
4769ed502f03SStefano Zampini 
4770ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4771ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4772ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
47739566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
47749566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
4775ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4776ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4777ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4778ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4779ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
47809566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47819566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4782ed502f03SStefano Zampini     } else {
47839566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
47849566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4785ed502f03SStefano Zampini     }
47869566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
47879566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
47889566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4789ed502f03SStefano Zampini     c->maxnz         = c->nz;
4790ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4791ed502f03SStefano Zampini     c->rmax          = 0;
4792ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4793ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4794ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4795ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4796ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4797ed502f03SStefano Zampini     }
47989566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
47999566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4800ed502f03SStefano Zampini     (*C)->nonzerostate++;
48019566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
48029566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4803ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4804ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4805ed502f03SStefano Zampini   } else {
480608401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4807ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4808ed502f03SStefano Zampini     if (c->nz) {
4809ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
48105f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4811aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
481208401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
48139566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
48149566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
48155f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
48165f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4817ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4818ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4819ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4820aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4821aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4822aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4823aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
48245f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4825ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4826ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
48279566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48289371c9d4SSatish Balay       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
48299371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4830ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
48319371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
48329371c9d4SSatish Balay       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4833ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
48349566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
48351a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
48365f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4837ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4838ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4839ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4840ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4841ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4842ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4843ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48441a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4845ed502f03SStefano Zampini       }
48469566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4847ed502f03SStefano Zampini     }
4848ed502f03SStefano Zampini   }
48499566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4850ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4851ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4852ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4853ed502f03SStefano Zampini   PetscFunctionReturn(0);
4854ed502f03SStefano Zampini }
4855c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the device values array of A into v.

  Input Parameters:
+ A   - the matrix; its values array is accessed read-only via MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - positions (within the values array) of the entries to copy. The array is uploaded to the
        device before use, so it is treated as host memory. If NULL (or n is 0), the first n
        values are copied contiguously instead.

  Output Parameter:
. v - destination for the n values; may be either host or device memory, as reported by isCudaMem()

  Notes:
  When v is host memory and idx is given, the gather is performed on the device into a temporary
  buffer which is then copied back to v.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[]) {
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* stage the index set on the device so it can drive the permutation iterator */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* Device staging buffer, only sized when v lives on the host. It is an automatic object so
       that it is released on every exit path, including the early return taken by PetscCallCUDA()
       on failure. */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather: dv[k] = av[idx[k]] for k = 0..n-1 */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled device -> host whenever v is host memory */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}
4890