xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 35cb6cd333087cc89d8d5031932d4f38af02614d)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
17d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
18d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
19a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
20d0967f54SJacob Faibussowitsch #endif
21a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
22a2cee5feSJed Brown #include <thrust/remove.h>
23a2cee5feSJed Brown #include <thrust/sort.h>
24a2cee5feSJed Brown #include <thrust/unique.h>
25e8d2b73aSMark Adams 
26e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
27afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
29afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
30afb2bd1cSJunchao Zhang 
31afb2bd1cSJunchao Zhang   typedef enum {
32afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
35afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
36afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
37afb2bd1cSJunchao Zhang 
38afb2bd1cSJunchao Zhang   typedef enum {
39afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
40afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
41afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
42afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
43afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
47afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
48afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
49afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
50afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
51afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
52afb2bd1cSJunchao Zhang 
53afb2bd1cSJunchao Zhang   typedef enum {
54*35cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
55*35cb6cd3SPierre Jolivet       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
56afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
57afb2bd1cSJunchao Zhang   */
58afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
59afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
60afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
61afb2bd1cSJunchao Zhang #endif
629ae82921SPaul Mullowney 
63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
686fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
696fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
70087f3262SPaul Mullowney 
716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
726fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
746fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
75dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
76a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7733c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
786fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
816fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
859ae82921SPaul Mullowney 
867f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
88470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
917f756511SDominic Meiser 
9257181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9457181aedSStefano Zampini 
95c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98c215019aSStefano Zampini 
/* Type-specific implementation behind MatCUSPARSESetFormat(): records the requested
   GPU storage format in the Mat_SeqAIJCUSPARSE structure hanging off A->spptr.
   For sequential AIJ matrices both supported operations update the same stored format. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    cusp->format = format;
  } else {
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
1169ae82921SPaul Mullowney 
117e057df02SPaul Mullowney /*@
11811a5261eSBarry Smith    MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
11911a5261eSBarry Smith    operation. Only the `MatMult()` operation can use different GPU storage formats
12011a5261eSBarry Smith 
121e057df02SPaul Mullowney    Not Collective
122e057df02SPaul Mullowney 
123e057df02SPaul Mullowney    Input Parameters:
12411a5261eSBarry Smith +  A - Matrix of type `MATSEQAIJCUSPARSE`
12511a5261eSBarry Smith .  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
12611a5261eSBarry Smith         `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
12711a5261eSBarry Smith -  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)
128e057df02SPaul Mullowney 
129e057df02SPaul Mullowney    Output Parameter:
130e057df02SPaul Mullowney 
131e057df02SPaul Mullowney    Level: intermediate
132e057df02SPaul Mullowney 
13311a5261eSBarry Smith .seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
134e057df02SPaul Mullowney @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the implementation the matrix type registered under "MatCUSPARSESetFormat_C";
     PetscTryMethod() is a no-op for types that did not register one */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
142e057df02SPaul Mullowney 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve(): stores the
   CPU-solve flag in the GPU-side data structure attached to A->spptr. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
151365b711fSMark Adams 
152365b711fSMark Adams /*@
15311a5261eSBarry Smith    MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`.
154365b711fSMark Adams 
155365b711fSMark Adams    Input Parameters:
15611a5261eSBarry Smith +  A - Matrix of type `MATSEQAIJCUSPARSE`
15711a5261eSBarry Smith -  use_cpu - set flag for using the built-in CPU `MatSolve()`
158365b711fSMark Adams 
159365b711fSMark Adams    Output Parameter:
160365b711fSMark Adams 
16111a5261eSBarry Smith    Note:
162365b711fSMark Adams    The cuSparse LU solver currently computes the factors with the built-in CPU method
163365b711fSMark Adams    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or GPU (GPU is the default).
165365b711fSMark Adams 
166365b711fSMark Adams    Level: intermediate
167365b711fSMark Adams 
16811a5261eSBarry Smith .seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
169365b711fSMark Adams @*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Dispatch to the implementation registered under "MatCUSPARSESetUseCPUSolve_C";
     silently does nothing for matrix types that did not register one */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
177365b711fSMark Adams 
/* MatSetOption() implementation for MATSEQAIJCUSPARSE: intercepts the one option
   with GPU-side consequences and forwards everything else to the SeqAIJ handler. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
193e6e9a74fSStefano Zampini 
194bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
195bddcd29dSMark Adams 
/* Numeric LU factorization for MATSEQAIJCUSPARSE: the factorization itself is done
   on the CPU by MatLUFactorNumeric_SeqAIJ(); this routine then selects the solve
   kernels (natural-ordering variants when both row and column permutations are the
   identity) and, unless CPU solve was requested, pushes the triangular factors to
   the GPU and runs the cuSPARSE solve analysis. */
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *fact = (Mat_SeqAIJ *)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  PetscBool           rid, cid;

  PetscFunctionBegin;
  /* make sure the host copy of A is current before factoring on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(fact->row, &rid));
  PetscCall(ISIdentity(fact->col, &cid));
  if (!cusp->use_cpu_solve) {
    if (rid && cid) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusp->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
227bddcd29dSMark Adams 
/* Processes runtime options for MATSEQAIJCUSPARSE matrices.  Storage-format and
   cuSPARSE-algorithm options apply only to non-factored matrices
   (A->factortype == MAT_FACTOR_NONE); the algorithm options require CUDA >= 11. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* -mat_cusparse_mult_storage_format sets the format used by MatMult() only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    /* -mat_cusparse_storage_format sets the format for all supported operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    /* note the Bool option writes use_cpu_solve in place; the setter is still invoked so the registered method runs */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
2639ae82921SPaul Mullowney 
/* Builds (or refreshes) the lower-triangular factor used by the GPU triangular solves.
   The factor is assembled on the host in CSR form with an explicit unit diagonal,
   copied to the device, and the cuSPARSE solve analysis is performed once.
   On subsequent calls with the structure already in place only the numerical values
   are recopied.  Assumes a->i/a->j/a->a hold the factored data laid out as produced
   by the SeqAIJ LU routines, with the strictly-lower part stored first in each
   row — NOTE(review): confirm against MatLUFactorNumeric_SeqAIJ. */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo; /* host CSR row offsets / column indices of L (pinned) */
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only rebuild/recopy when the up-to-date data lives on the CPU */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffer for the values; kept alive in loTriFactor->AA_h for later value-only updates */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0; /* row 0 of L is just the unit diagonal */
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: device-side CSR copy of L */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* query and allocate the workspace the csrsv analysis/solve requires */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo; /* values buffer is retained; index buffers are freed below */
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0; /* unit diagonal entry for row i */
          offset += 1;
          v += nz;
        }
        /* push the refreshed values to the existing device arrays; sparsity pattern is unchanged */
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3949ae82921SPaul Mullowney 
395d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
396d71ae5a4SJacob Faibussowitsch {
3979ae82921SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
3989ae82921SPaul Mullowney   PetscInt                           n                  = A->rmap->n;
3999ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
400aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
4019ae82921SPaul Mullowney   const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
4029ae82921SPaul Mullowney   const MatScalar                   *aa = a->a, *v;
4039ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4049ae82921SPaul Mullowney   PetscInt                           i, nz, nzUpper, offset;
4059ae82921SPaul Mullowney 
4069ae82921SPaul Mullowney   PetscFunctionBegin;
407cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
408c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4099ae82921SPaul Mullowney     try {
4109ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4119ae82921SPaul Mullowney       nzUpper = adiag[0] - adiag[n];
412da79fbbcSStefano Zampini       if (!upTriFactor) {
4132cbc15d9SMark         PetscScalar *AAUp;
4142cbc15d9SMark 
4159566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
4162cbc15d9SMark 
4179ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
4189566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
4199566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
4209ae82921SPaul Mullowney 
4219ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
4229ae82921SPaul Mullowney         AiUp[0] = (PetscInt)0;
4239ae82921SPaul Mullowney         AiUp[n] = nzUpper;
4249ae82921SPaul Mullowney         offset  = nzUpper;
4259ae82921SPaul Mullowney         for (i = n - 1; i >= 0; i--) {
4269ae82921SPaul Mullowney           v  = aa + adiag[i + 1] + 1;
4279ae82921SPaul Mullowney           vi = aj + adiag[i + 1] + 1;
4289ae82921SPaul Mullowney 
429e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
4309ae82921SPaul Mullowney           nz = adiag[i] - adiag[i + 1] - 1;
4319ae82921SPaul Mullowney 
432e057df02SPaul Mullowney           /* decrement the offset */
4339ae82921SPaul Mullowney           offset -= (nz + 1);
4349ae82921SPaul Mullowney 
435e057df02SPaul Mullowney           /* first, set the diagonal elements */
4369ae82921SPaul Mullowney           AjUp[offset] = (PetscInt)i;
43709f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1. / v[nz];
4389ae82921SPaul Mullowney           AiUp[i]      = AiUp[i + 1] - (nz + 1);
4399ae82921SPaul Mullowney 
4409566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
4419566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
4429ae82921SPaul Mullowney         }
4432205254eSKarl Rupp 
444aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4459566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
446da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
4472205254eSKarl Rupp 
448aa372e3fSPaul Mullowney         /* Create the matrix description */
4499566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
4509566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4529566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
453afb2bd1cSJunchao Zhang #else
4549566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
455afb2bd1cSJunchao Zhang #endif
4569566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
4579566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
458aa372e3fSPaul Mullowney 
459aa372e3fSPaul Mullowney         /* set the operation */
460aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
461aa372e3fSPaul Mullowney 
462aa372e3fSPaul Mullowney         /* set the matrix */
463aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
464aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = n;
465aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = n;
466aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
467aa372e3fSPaul Mullowney 
468aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
469aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);
470aa372e3fSPaul Mullowney 
471aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
472aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);
473aa372e3fSPaul Mullowney 
474aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
475aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);
476aa372e3fSPaul Mullowney 
477afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4789566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
479261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
4801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
4819371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4829371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
4839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
484afb2bd1cSJunchao Zhang #endif
485afb2bd1cSJunchao Zhang 
486aa372e3fSPaul Mullowney         /* perform the solve analysis */
4879371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
4889f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
4899f7ba44dSJacob Faibussowitsch 
4909566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4919566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
492aa372e3fSPaul Mullowney 
493da79fbbcSStefano Zampini         /* assign the pointer */
494aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
4952cbc15d9SMark         upTriFactor->AA_h                                          = AAUp;
4969566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
4979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
4989566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
499da79fbbcSStefano Zampini       } else {
50048a46eb9SPierre Jolivet         if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
501da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
502da79fbbcSStefano Zampini         offset = nzUpper;
503da79fbbcSStefano Zampini         for (i = n - 1; i >= 0; i--) {
504da79fbbcSStefano Zampini           v = aa + adiag[i + 1] + 1;
505da79fbbcSStefano Zampini 
506da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
507da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i + 1] - 1;
508da79fbbcSStefano Zampini 
509da79fbbcSStefano Zampini           /* decrement the offset */
510da79fbbcSStefano Zampini           offset -= (nz + 1);
511da79fbbcSStefano Zampini 
512da79fbbcSStefano Zampini           /* first, set the diagonal elements */
5132cbc15d9SMark           upTriFactor->AA_h[offset] = 1. / v[nz];
5149566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
515da79fbbcSStefano Zampini         }
5162cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
5179566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
518da79fbbcSStefano Zampini       }
519d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
520d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
521d71ae5a4SJacob Faibussowitsch     }
5229ae82921SPaul Mullowney   }
5239ae82921SPaul Mullowney   PetscFunctionReturn(0);
5249ae82921SPaul Mullowney }
5259ae82921SPaul Mullowney 
526d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
527d71ae5a4SJacob Faibussowitsch {
5289ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
5299ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
5309ae82921SPaul Mullowney   IS                            isrow = a->row, iscol = a->icol;
5319ae82921SPaul Mullowney   PetscBool                     row_identity, col_identity;
5329ae82921SPaul Mullowney   PetscInt                      n = A->rmap->n;
5339ae82921SPaul Mullowney 
5349ae82921SPaul Mullowney   PetscFunctionBegin;
53528b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
5369566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
5379566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
5382205254eSKarl Rupp 
539ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
540aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = a->nz;
5419ae82921SPaul Mullowney 
542c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
543e057df02SPaul Mullowney   /* lower triangular indices */
5449566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow, &row_identity));
545da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
546da79fbbcSStefano Zampini     const PetscInt *r;
547da79fbbcSStefano Zampini 
5489566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow, &r));
549aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
550aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r + n);
5519566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow, &r));
5529566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
553da79fbbcSStefano Zampini   }
5549ae82921SPaul Mullowney 
555e057df02SPaul Mullowney   /* upper triangular indices */
5569566063dSJacob Faibussowitsch   PetscCall(ISIdentity(iscol, &col_identity));
557da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
558da79fbbcSStefano Zampini     const PetscInt *c;
559da79fbbcSStefano Zampini 
5609566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iscol, &c));
561aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
562aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c + n);
5639566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iscol, &c));
5649566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
565da79fbbcSStefano Zampini   }
5669ae82921SPaul Mullowney   PetscFunctionReturn(0);
5679ae82921SPaul Mullowney }
5689ae82921SPaul Mullowney 
569d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
570d71ae5a4SJacob Faibussowitsch {
571087f3262SPaul Mullowney   Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
572087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
573aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
574aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
575087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
576087f3262SPaul Mullowney   PetscScalar                       *AAUp;
577087f3262SPaul Mullowney   PetscScalar                       *AALo;
578087f3262SPaul Mullowney   PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
579087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
580087f3262SPaul Mullowney   const PetscInt                    *ai = b->i, *aj = b->j, *vj;
581087f3262SPaul Mullowney   const MatScalar                   *aa = b->a, *v;
582087f3262SPaul Mullowney 
583087f3262SPaul Mullowney   PetscFunctionBegin;
584cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
585c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
586087f3262SPaul Mullowney     try {
5879566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
5889566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
589da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
590087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
5919566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
5929566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));
593087f3262SPaul Mullowney 
594087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
595087f3262SPaul Mullowney         AiUp[0] = (PetscInt)0;
596087f3262SPaul Mullowney         AiUp[n] = nzUpper;
597087f3262SPaul Mullowney         offset  = 0;
598087f3262SPaul Mullowney         for (i = 0; i < n; i++) {
599087f3262SPaul Mullowney           /* set the pointers */
600087f3262SPaul Mullowney           v  = aa + ai[i];
601087f3262SPaul Mullowney           vj = aj + ai[i];
602087f3262SPaul Mullowney           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
603087f3262SPaul Mullowney 
604087f3262SPaul Mullowney           /* first, set the diagonal elements */
605087f3262SPaul Mullowney           AjUp[offset] = (PetscInt)i;
60609f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0 / v[nz];
607087f3262SPaul Mullowney           AiUp[i]      = offset;
60809f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0 / v[nz];
609087f3262SPaul Mullowney 
610087f3262SPaul Mullowney           offset += 1;
611087f3262SPaul Mullowney           if (nz > 0) {
6129566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
6139566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
614087f3262SPaul Mullowney             for (j = offset; j < offset + nz; j++) {
615087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
616087f3262SPaul Mullowney               AALo[j] = AAUp[j] / v[nz];
617087f3262SPaul Mullowney             }
618087f3262SPaul Mullowney             offset += nz;
619087f3262SPaul Mullowney           }
620087f3262SPaul Mullowney         }
621087f3262SPaul Mullowney 
622aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
6239566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
624da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
625087f3262SPaul Mullowney 
626aa372e3fSPaul Mullowney         /* Create the matrix description */
6279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
6289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
6291b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
631afb2bd1cSJunchao Zhang #else
6329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
633afb2bd1cSJunchao Zhang #endif
6349566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
6359566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
636087f3262SPaul Mullowney 
637aa372e3fSPaul Mullowney         /* set the matrix */
638aa372e3fSPaul Mullowney         upTriFactor->csrMat              = new CsrMatrix;
639aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows    = A->rmap->n;
640aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols    = A->cmap->n;
641aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
642aa372e3fSPaul Mullowney 
643aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
644aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
645aa372e3fSPaul Mullowney 
646aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
647aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
648aa372e3fSPaul Mullowney 
649aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
650aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
651aa372e3fSPaul Mullowney 
652afb2bd1cSJunchao Zhang         /* set the operation */
653afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
654afb2bd1cSJunchao Zhang 
655afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
6569566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
657261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
6581b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6599371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
6609371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
6619566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
662afb2bd1cSJunchao Zhang #endif
663afb2bd1cSJunchao Zhang 
664aa372e3fSPaul Mullowney         /* perform the solve analysis */
6659371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
6669f7ba44dSJacob Faibussowitsch                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
6679f7ba44dSJacob Faibussowitsch 
6689566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
6699566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
670aa372e3fSPaul Mullowney 
671da79fbbcSStefano Zampini         /* assign the pointer */
672aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
673aa372e3fSPaul Mullowney 
674aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
6759566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
676da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
677aa372e3fSPaul Mullowney 
678aa372e3fSPaul Mullowney         /* Create the matrix description */
6799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
6809566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
6811b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
6829566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
683afb2bd1cSJunchao Zhang #else
6849566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
685afb2bd1cSJunchao Zhang #endif
6869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
6879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
688aa372e3fSPaul Mullowney 
689aa372e3fSPaul Mullowney         /* set the operation */
690aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
691aa372e3fSPaul Mullowney 
692aa372e3fSPaul Mullowney         /* set the matrix */
693aa372e3fSPaul Mullowney         loTriFactor->csrMat              = new CsrMatrix;
694aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows    = A->rmap->n;
695aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols    = A->cmap->n;
696aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
697aa372e3fSPaul Mullowney 
698aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
699aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);
700aa372e3fSPaul Mullowney 
701aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
702aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);
703aa372e3fSPaul Mullowney 
704aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
705aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
706aa372e3fSPaul Mullowney 
707afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
7089566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
709261a78b4SJunchao Zhang         PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
7101b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
7119371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
7129371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
7139566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
714afb2bd1cSJunchao Zhang #endif
715afb2bd1cSJunchao Zhang 
716aa372e3fSPaul Mullowney         /* perform the solve analysis */
7179371c9d4SSatish Balay         PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
7189f7ba44dSJacob Faibussowitsch                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
7199f7ba44dSJacob Faibussowitsch 
7209566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
7219566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
722aa372e3fSPaul Mullowney 
723da79fbbcSStefano Zampini         /* assign the pointer */
724aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
725087f3262SPaul Mullowney 
7269566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
7279566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
7289566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
729da79fbbcSStefano Zampini       } else {
730da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
731da79fbbcSStefano Zampini         offset = 0;
732da79fbbcSStefano Zampini         for (i = 0; i < n; i++) {
733da79fbbcSStefano Zampini           /* set the pointers */
734da79fbbcSStefano Zampini           v  = aa + ai[i];
735da79fbbcSStefano Zampini           nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
736da79fbbcSStefano Zampini 
737da79fbbcSStefano Zampini           /* first, set the diagonal elements */
738da79fbbcSStefano Zampini           AAUp[offset] = 1.0 / v[nz];
739da79fbbcSStefano Zampini           AALo[offset] = 1.0 / v[nz];
740da79fbbcSStefano Zampini 
741da79fbbcSStefano Zampini           offset += 1;
742da79fbbcSStefano Zampini           if (nz > 0) {
7439566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
744da79fbbcSStefano Zampini             for (j = offset; j < offset + nz; j++) {
745da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
746da79fbbcSStefano Zampini               AALo[j] = AAUp[j] / v[nz];
747da79fbbcSStefano Zampini             }
748da79fbbcSStefano Zampini             offset += nz;
749da79fbbcSStefano Zampini           }
750da79fbbcSStefano Zampini         }
75128b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
75228b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
753da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
754da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
7559566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
756da79fbbcSStefano Zampini       }
7579566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
7589566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
759d71ae5a4SJacob Faibussowitsch     } catch (char *ex) {
760d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
761d71ae5a4SJacob Faibussowitsch     }
762087f3262SPaul Mullowney   }
763087f3262SPaul Mullowney   PetscFunctionReturn(0);
764087f3262SPaul Mullowney }
765087f3262SPaul Mullowney 
766d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
767d71ae5a4SJacob Faibussowitsch {
768087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ *)A->data;
769087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
770087f3262SPaul Mullowney   IS                            ip                 = a->row;
771087f3262SPaul Mullowney   PetscBool                     perm_identity;
772087f3262SPaul Mullowney   PetscInt                      n = A->rmap->n;
773087f3262SPaul Mullowney 
774087f3262SPaul Mullowney   PetscFunctionBegin;
77528b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
7769566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
777ad540459SPierre Jolivet   if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n);
778aa372e3fSPaul Mullowney   cusparseTriFactors->nnz = (a->nz - n) * 2 + n;
779aa372e3fSPaul Mullowney 
780da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
781da79fbbcSStefano Zampini 
782087f3262SPaul Mullowney   /* lower triangular indices */
7839566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
784087f3262SPaul Mullowney   if (!perm_identity) {
7854e4bbfaaSStefano Zampini     IS              iip;
786da79fbbcSStefano Zampini     const PetscInt *irip, *rip;
7874e4bbfaaSStefano Zampini 
7889566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip));
7899566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip, &irip));
7909566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip, &rip));
791aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
792aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip + n);
793aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
7944e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip + n);
7959566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip, &irip));
7969566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
7979566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip, &rip));
7989566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
799da79fbbcSStefano Zampini   }
800087f3262SPaul Mullowney   PetscFunctionReturn(0);
801087f3262SPaul Mullowney }
802087f3262SPaul Mullowney 
803d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
804d71ae5a4SJacob Faibussowitsch {
805087f3262SPaul Mullowney   Mat_SeqAIJ *b  = (Mat_SeqAIJ *)B->data;
806087f3262SPaul Mullowney   IS          ip = b->row;
807087f3262SPaul Mullowney   PetscBool   perm_identity;
808087f3262SPaul Mullowney 
809087f3262SPaul Mullowney   PetscFunctionBegin;
8109566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
8119566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
812ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
813087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
8149566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip, &perm_identity));
815087f3262SPaul Mullowney   if (perm_identity) {
816087f3262SPaul Mullowney     B->ops->solve             = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
817087f3262SPaul Mullowney     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
8184e4bbfaaSStefano Zampini     B->ops->matsolve          = NULL;
8194e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
820087f3262SPaul Mullowney   } else {
821087f3262SPaul Mullowney     B->ops->solve             = MatSolve_SeqAIJCUSPARSE;
822087f3262SPaul Mullowney     B->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE;
8234e4bbfaaSStefano Zampini     B->ops->matsolve          = NULL;
8244e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
825087f3262SPaul Mullowney   }
826087f3262SPaul Mullowney 
827087f3262SPaul Mullowney   /* get the triangular factors */
8289566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
829087f3262SPaul Mullowney   PetscFunctionReturn(0);
830087f3262SPaul Mullowney }
8319ae82921SPaul Mullowney 
832d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
833d71ae5a4SJacob Faibussowitsch {
834bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
835aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
836aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
837da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
838da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
839aa372e3fSPaul Mullowney   cusparseIndexBase_t                indexBase;
840aa372e3fSPaul Mullowney   cusparseMatrixType_t               matrixType;
841aa372e3fSPaul Mullowney   cusparseFillMode_t                 fillMode;
842aa372e3fSPaul Mullowney   cusparseDiagType_t                 diagType;
843b175d8bbSPaul Mullowney 
844bda325fcSPaul Mullowney   PetscFunctionBegin;
845aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
8469566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
847da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
848aa372e3fSPaul Mullowney 
849aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
850aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
851aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(loTriFactor->descr);
8529371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
853aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(loTriFactor->descr);
854aa372e3fSPaul Mullowney 
855aa372e3fSPaul Mullowney   /* Create the matrix description */
8569566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
8579566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
8589566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
8599566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
8609566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
861aa372e3fSPaul Mullowney 
862aa372e3fSPaul Mullowney   /* set the operation */
863aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
864aa372e3fSPaul Mullowney 
865aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
866aa372e3fSPaul Mullowney   loTriFactorT->csrMat                 = new CsrMatrix;
867afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
868afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
869aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
870afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1);
871afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
872afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
873aa372e3fSPaul Mullowney 
874aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
875afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8769371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(),
8779371c9d4SSatish Balay                                                   loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
8789371c9d4SSatish Balay                                                   loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
8799566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize));
880afb2bd1cSJunchao Zhang #endif
881afb2bd1cSJunchao Zhang 
8829566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
8839f7ba44dSJacob Faibussowitsch   {
8849f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
8859f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
8869371c9d4SSatish Balay                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(),
887afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
8889f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer);
889afb2bd1cSJunchao Zhang #else
8909f7ba44dSJacob Faibussowitsch                                  loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
891afb2bd1cSJunchao Zhang #endif
8929f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
8939f7ba44dSJacob Faibussowitsch   }
8949f7ba44dSJacob Faibussowitsch 
8959566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
8969566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
897aa372e3fSPaul Mullowney 
898afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
8999566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
900261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo));
9011b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9029371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
9039371c9d4SSatish Balay                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize));
9049566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize));
905afb2bd1cSJunchao Zhang #endif
906afb2bd1cSJunchao Zhang 
907afb2bd1cSJunchao Zhang   /* perform the solve analysis */
9089371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
9099f7ba44dSJacob Faibussowitsch                                             loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
9109f7ba44dSJacob Faibussowitsch 
9119566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9129566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
913aa372e3fSPaul Mullowney 
914da79fbbcSStefano Zampini   /* assign the pointer */
915aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
916aa372e3fSPaul Mullowney 
917aa372e3fSPaul Mullowney   /*********************************************/
918aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
919aa372e3fSPaul Mullowney   /*********************************************/
920aa372e3fSPaul Mullowney 
921aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
9229566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
923da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
924aa372e3fSPaul Mullowney 
925aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
926aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
927aa372e3fSPaul Mullowney   indexBase  = cusparseGetMatIndexBase(upTriFactor->descr);
9289371c9d4SSatish Balay   fillMode   = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
929aa372e3fSPaul Mullowney   diagType   = cusparseGetMatDiagType(upTriFactor->descr);
930aa372e3fSPaul Mullowney 
931aa372e3fSPaul Mullowney   /* Create the matrix description */
9329566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
9339566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
9349566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
9359566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
9369566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
937aa372e3fSPaul Mullowney 
938aa372e3fSPaul Mullowney   /* set the operation */
939aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
940aa372e3fSPaul Mullowney 
941aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
942aa372e3fSPaul Mullowney   upTriFactorT->csrMat                 = new CsrMatrix;
943afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
944afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
945aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
946afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1);
947afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
948afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
949aa372e3fSPaul Mullowney 
950aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
951afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9529371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(),
9539371c9d4SSatish Balay                                                   upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
9549371c9d4SSatish Balay                                                   upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
9559566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize));
956afb2bd1cSJunchao Zhang #endif
957afb2bd1cSJunchao Zhang 
9589566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
9599f7ba44dSJacob Faibussowitsch   {
9609f7ba44dSJacob Faibussowitsch     // there is no clean way to have PetscCallCUSPARSE wrapping this function...
9619f7ba44dSJacob Faibussowitsch     auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
9629371c9d4SSatish Balay                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(),
963afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
9649f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer);
965afb2bd1cSJunchao Zhang #else
9669f7ba44dSJacob Faibussowitsch                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
967afb2bd1cSJunchao Zhang #endif
9689f7ba44dSJacob Faibussowitsch     PetscCallCUSPARSE(stat);
9699f7ba44dSJacob Faibussowitsch   }
970d49cd2b7SBarry Smith 
9719566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9729566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
973aa372e3fSPaul Mullowney 
974afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
9759566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
976261a78b4SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
9771b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
9789371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9799371c9d4SSatish Balay                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
9809566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
981afb2bd1cSJunchao Zhang #endif
982afb2bd1cSJunchao Zhang 
983afb2bd1cSJunchao Zhang   /* perform the solve analysis */
9845f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
9859371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
9869f7ba44dSJacob Faibussowitsch                                             upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
987d49cd2b7SBarry Smith 
9889566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
9899566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
990aa372e3fSPaul Mullowney 
991da79fbbcSStefano Zampini   /* assign the pointer */
992aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
993bda325fcSPaul Mullowney   PetscFunctionReturn(0);
994bda325fcSPaul Mullowney }
995bda325fcSPaul Mullowney 
9969371c9d4SSatish Balay struct PetscScalarToPetscInt {
9979371c9d4SSatish Balay   __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
998a49f1ed0SStefano Zampini };
999a49f1ed0SStefano Zampini 
1000d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1001d71ae5a4SJacob Faibussowitsch {
1002aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1003a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1004bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1005bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1006aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1007b175d8bbSPaul Mullowney 
1008bda325fcSPaul Mullowney   PetscFunctionBegin;
10099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1010a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
101128b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1012a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
101308401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
10141a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
10159566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
101748a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1018a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1019aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
10209566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1021aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
10229566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
10239566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1024aa372e3fSPaul Mullowney 
1025b06137fdSPaul Mullowney     /* set alpha and beta */
10269566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
10279566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
10289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
10299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10309566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10319566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1032b06137fdSPaul Mullowney 
1033aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1034aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1035a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1036554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1037554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1038aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1039a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1040aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1041aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1042a3fdcf43SKarl Rupp 
1043ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
104481902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1045afb2bd1cSJunchao Zhang 
1046afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10473606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
10489371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
10499371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
10509371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
10513606e59fSJunchao Zhang   #else
10523606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
10533606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
10543606e59fSJunchao Zhang 
10553606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
10563606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
10573606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
10583606e59fSJunchao Zhang         */
10593606e59fSJunchao Zhang       if (matrixT->num_entries) {
10609371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
10619371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
10623606e59fSJunchao Zhang 
10633606e59fSJunchao Zhang       } else {
10643606e59fSJunchao Zhang         matstructT->matDescr = NULL;
10653606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
10663606e59fSJunchao Zhang       }
10673606e59fSJunchao Zhang   #endif
1068afb2bd1cSJunchao Zhang #endif
1069aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1070afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1071afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1072afb2bd1cSJunchao Zhang #else
1073aa372e3fSPaul Mullowney       CsrMatrix *temp = new CsrMatrix;
107451c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
107551c6d536SStefano Zampini       /* First convert HYB to CSR */
1076aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1077aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1078aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1079aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1080aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1081aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1082aa372e3fSPaul Mullowney 
10839371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
10849371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1085aa372e3fSPaul Mullowney 
1086aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1087aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1088aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1089aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1090aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1091aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1092aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1093aa372e3fSPaul Mullowney 
10949371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
10959371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
10969371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1097aa372e3fSPaul Mullowney 
1098aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1099aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
11009566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
11019371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
11029371c9d4SSatish Balay       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
11039371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1104aa372e3fSPaul Mullowney 
1105aa372e3fSPaul Mullowney       /* assign the pointer */
1106aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
11071a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1108aa372e3fSPaul Mullowney       /* delete temporaries */
1109aa372e3fSPaul Mullowney       if (tempT) {
1110aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1111aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1112aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1113aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1114087f3262SPaul Mullowney       }
1115aa372e3fSPaul Mullowney       if (temp) {
1116aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1117aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1118aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1119aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1120aa372e3fSPaul Mullowney       }
1121afb2bd1cSJunchao Zhang #endif
1122aa372e3fSPaul Mullowney     }
1123a49f1ed0SStefano Zampini   }
1124a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1125a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1126a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
112728b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
112828b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
112928b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
113028b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
113128b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
113228b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
113328b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
113428b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1135a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1136a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1137a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
11389566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1139a49f1ed0SStefano Zampini     }
1140a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1141a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1142792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1143a49f1ed0SStefano Zampini 
1144a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1145a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1146a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1147a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
11489371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
11499371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
11509371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
11519566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1152a49f1ed0SStefano Zampini #endif
1153a49f1ed0SStefano Zampini 
11541a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
11551a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
11561a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
11571a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
11581a2c6b5cSJunchao Zhang 
11591a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
11601a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
11611a2c6b5cSJunchao Zhang         */
11629371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1163a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11649371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
11659371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1166a49f1ed0SStefano Zampini #else
11679371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
11689371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1169a49f1ed0SStefano Zampini #endif
11701a2c6b5cSJunchao Zhang       } else {
11711a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
11721a2c6b5cSJunchao Zhang       }
11731a2c6b5cSJunchao Zhang 
1174a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1175792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1176a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11779566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1178a49f1ed0SStefano Zampini #endif
1179a49f1ed0SStefano Zampini     }
11809371c9d4SSatish Balay     PetscCallThrust(
11819371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1182a49f1ed0SStefano Zampini   }
11839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
11849566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1185213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1186213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1187aa372e3fSPaul Mullowney   /* assign the pointer */
1188aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
11891a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
1190bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1191bda325fcSPaul Mullowney }
1192bda325fcSPaul Mullowney 
1193a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  Solve A^T x = b using previously computed triangular factors, where the factorization used
  non-trivial row/column permutations (i.e., non-natural ordering).

  Pipeline, entirely on the GPU:
    1. gather b through the row permutation:                x[i] = b[rperm[i]]
    2. solve with the transposed upper factor:              x -> tempGPU
    3. solve with the transposed lower factor:              tempGPU -> x
    4. gather x through the column permutation into tempGPU, then copy it back
       (the permuted gather cannot be done in place)
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transposed factors lazily, on the first transpose solve */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation: x[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the transposed upper factor: x -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve with the transposed lower factor: tempGPU -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, gather the solution through the column permutation into a temporary ... the permuted gather can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1245bda325fcSPaul Mullowney 
/* Solve A^T x = b with factors computed in natural ordering, so no row/column permutations are applied */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Build the transposed-factor analysis data lazily, on the first transpose solve */
  if (!lo && !up) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lo = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtrTranspose;
    up = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtrTranspose;
  }

  /* Map the PETSc vectors to raw device pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Solve with the transposed upper factor: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, b, work->data().get(), up->solvePolicy, up->solveBuffer));

  /* Solve with the transposed lower factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, work->data().get(), x, lo->solvePolicy, lo->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1283bda325fcSPaul Mullowney 
/*
  Solve A x = b using previously computed triangular factors, where the factorization used
  non-trivial row/column permutations (i.e., non-natural ordering).

  Pipeline, entirely on the GPU:
    1. gather b through the row permutation into the work vector:  tempGPU[i] = b[rperm[i]]
    2. solve with the lower factor:                                tempGPU -> x
    3. solve with the upper factor:                                x -> tempGPU
    4. gather the work vector through the column permutation:      x[i] = tempGPU[cperm[i]]
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, gather b through the row permutation into the work vector: tempGPU[i] = b[rperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: tempGPU -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U: x -> tempGPU */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, gather the work vector through the column permutation: x[i] = tempGPU[cperm[i]] */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
13239ae82921SPaul Mullowney 
/* Solve A x = b with factors computed in natural ordering, so no row/column permutations are applied */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  Mat_SeqAIJCUSPARSETriFactors      *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up   = (Mat_SeqAIJCUSPARSETriFactorStruct *)fs->upTriFactorPtr;
  THRUSTARRAY                       *work = (THRUSTARRAY *)fs->workVector;

  PetscFunctionBegin;
  /* Map the PETSc vectors to raw device pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &x));
  PetscCall(VecCUDAGetArrayRead(bb, &b));

  PetscCall(PetscLogGpuTimeBegin());
  /* Solve with the lower factor: b -> work */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, lo->solveOp, lo->csrMat->num_rows, lo->csrMat->num_entries, &PETSC_CUSPARSE_ONE, lo->descr, lo->csrMat->values->data().get(), lo->csrMat->row_offsets->data().get(),
                                         lo->csrMat->column_indices->data().get(), lo->solveInfo, b, work->data().get(), lo->solvePolicy, lo->solveBuffer));

  /* Solve with the upper factor: work -> x */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(fs->handle, up->solveOp, up->csrMat->num_rows, up->csrMat->num_entries, &PETSC_CUSPARSE_ONE, up->descr, up->csrMat->values->data().get(), up->csrMat->row_offsets->data().get(),
                                         up->csrMat->column_indices->data().get(), up->solveInfo, work->data().get(), x, up->solvePolicy, up->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &b));
  PetscCall(VecCUDARestoreArrayWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * fs->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
13539ae82921SPaul Mullowney 
1354da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
/* cusparseSpSV_solve() and friends first appeared in cusparse-11.3, but we require CUSPARSE_VERSION >= 11500 (cusparse-11.5) here */
/* Solve A x = b with the ILU(0) factors, via the generic cusparseSpSV interface: L y = b, then U x = y */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a  = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *bdata;
  PetscScalar                  *xdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Lower solve: L Y = X, with the intermediate y kept in tf->Y.
     Note cusparseSpSV_solve() secretly reuses the external buffer passed to cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_Y, tf->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_L, tf->dnVecDescr_X, tf->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_L));

  /* Upper solve: U X = Y, writing the result straight into x's device array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(tf->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(tf->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, tf->spMatDescr_U, tf->dnVecDescr_Y, tf->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, tf->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1387da112707SJunchao Zhang 
/*
  Solve A^T x = b with the ILU(0) factors via cusparseSpSV, by solving U^T y = b and then L^T x = y.
  CUSPARSE_OPERATION_TRANSPOSE is applied to the same L/U matrix descriptors used by the
  non-transpose solve; only the SpSV descriptors and work buffers are transpose-specific.
  Those are created lazily on the first transpose solve, and the (numeric) SpSV analysis is
  redone whenever the factor values changed since the last transpose solve.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *barray;
  PetscScalar                  *xarray;

  PetscFunctionBegin;
  if (!fs->createdTransposeSpSVDescr) { /* MatSolveTranspose() is called for the first time: create descriptors and buffers */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* SpSV analysis is numeric; redo it if the factor values changed since the last transpose solve */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  PetscCall(PetscLogGpuTimeBegin());

  /* Solve Ut*y = b, with the intermediate y kept in fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, /* Ut Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1436da112707SJunchao Zhang 
/*
  Numeric ILU(0) factorization on the GPU, done in place in fact's device CSR arrays.
  The sparsity pattern, device arrays (csrRowPtr/csrColIdx/csrVal), and cusparse descriptors are
  assumed to have been set up by the symbolic phase; here we (1) copy A's values over,
  (2) run cusparseXcsrilu02() in place, and (3) redo the numeric SpSV analysis for L and U.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* In debug mode, ask cusparse whether the factorization hit a numerical zero pivot */
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  /* Factor now lives only on the GPU; install the SpSV-based solve kernels */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1488da112707SJunchao Zhang 
1489d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1490d71ae5a4SJacob Faibussowitsch {
1491da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1492da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1493da112707SJunchao Zhang   PetscInt                      m, nz;
1494da112707SJunchao Zhang 
1495da112707SJunchao Zhang   PetscFunctionBegin;
1496da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1497da112707SJunchao Zhang     PetscInt  i;
1498da112707SJunchao Zhang     PetscBool flg, missing;
1499da112707SJunchao Zhang 
1500da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1501da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1502da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1503da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1504da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1505da112707SJunchao Zhang   }
1506da112707SJunchao Zhang 
1507da112707SJunchao Zhang   /* Free the old stale stuff */
1508da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1509da112707SJunchao Zhang 
1510da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1511da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1512da112707SJunchao Zhang    */
1513da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1514da112707SJunchao Zhang 
1515da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1516da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ILU;
1517da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1518da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1519da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1520da112707SJunchao Zhang 
1521da112707SJunchao Zhang   aij->row = NULL;
1522da112707SJunchao Zhang   aij->col = NULL;
1523da112707SJunchao Zhang 
1524da112707SJunchao Zhang   /* ====================================================================== */
1525da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1526da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1527da112707SJunchao Zhang   /* ====================================================================== */
1528da112707SJunchao Zhang   const int *Ai, *Aj;
1529da112707SJunchao Zhang 
1530da112707SJunchao Zhang   m  = fact->rmap->n;
1531da112707SJunchao Zhang   nz = aij->nz;
1532da112707SJunchao Zhang 
1533da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1534da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1535da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1536da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1537da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1538da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1539da112707SJunchao Zhang 
1540da112707SJunchao Zhang   /* ====================================================================== */
1541da112707SJunchao Zhang   /* Create descriptors for M, L, U                                         */
1542da112707SJunchao Zhang   /* ====================================================================== */
1543da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1544da112707SJunchao Zhang   cusparseDiagType_t diagType;
1545da112707SJunchao Zhang 
1546da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1547da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1548da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1549da112707SJunchao Zhang 
1550da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1551da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1552da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1553da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1554da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1555da112707SJunchao Zhang   */
1556da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1557da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_UNIT;
15589371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
15599371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
15609371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1561da112707SJunchao Zhang 
1562da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_UPPER;
1563da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
15649371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
15659371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
15669371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1567da112707SJunchao Zhang 
1568da112707SJunchao Zhang   /* ========================================================================= */
1569da112707SJunchao Zhang   /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
1570da112707SJunchao Zhang   /* ========================================================================= */
1571da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
15729371c9d4SSatish Balay   if (m)
15739371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
15749371c9d4SSatish Balay                                                    fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));
1575da112707SJunchao Zhang 
1576da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1577da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1578da112707SJunchao Zhang 
1579da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1580da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1581da112707SJunchao Zhang 
1582da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
15839371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1584da112707SJunchao Zhang 
1585da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
15869371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));
1587da112707SJunchao Zhang 
1588da112707SJunchao Zhang   /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
158912ba2bc6SJunchao Zhang      and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
159012ba2bc6SJunchao Zhang      spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
159112ba2bc6SJunchao Zhang      To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
1592da112707SJunchao Zhang    */
159312ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
159412ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
159512ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1596da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
159712ba2bc6SJunchao Zhang   } else {
159812ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
159912ba2bc6SJunchao Zhang     fs->spsvBuffer_U = fs->factBuffer_M;
1600da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
160112ba2bc6SJunchao Zhang   }
1602da112707SJunchao Zhang 
1603da112707SJunchao Zhang   /* ========================================================================== */
1604da112707SJunchao Zhang   /* Perform analysis of ilu0 on M, SpSv on L and U                             */
1605da112707SJunchao Zhang   /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
1606da112707SJunchao Zhang   /* ========================================================================== */
1607da112707SJunchao Zhang   int              structural_zero;
1608da112707SJunchao Zhang   cusparseStatus_t status;
1609da112707SJunchao Zhang 
1610da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
16119371c9d4SSatish Balay   if (m)
16129371c9d4SSatish Balay     PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
16139371c9d4SSatish Balay                                                  fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
1614da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1615da112707SJunchao Zhang     /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1616da112707SJunchao Zhang     status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
1617da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
1618da112707SJunchao Zhang   }
1619da112707SJunchao Zhang 
1620da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
16210dd8c0acSJunchao Zhang   {
1622da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
16230dd8c0acSJunchao Zhang     PetscInt      *Ai, *Adiag, nzRow, nzLeft;
1624da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1625da112707SJunchao Zhang 
1626da112707SJunchao Zhang     PetscCall(MatMarkDiagonal_SeqAIJ(A));
1627da112707SJunchao Zhang     Ai    = Aseq->i;
1628da112707SJunchao Zhang     Adiag = Aseq->diag;
1629da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1630da112707SJunchao Zhang       if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
1631da112707SJunchao Zhang         nzRow  = Ai[i + 1] - Ai[i];
1632da112707SJunchao Zhang         nzLeft = Adiag[i] - Ai[i];
1633da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1634da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1635da112707SJunchao Zhang         */
1636da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1637da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1638da112707SJunchao Zhang       }
1639da112707SJunchao Zhang     }
1640da112707SJunchao Zhang     fs->numericFactFlops = flops;
16410dd8c0acSJunchao Zhang   }
1642da112707SJunchao Zhang   fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
1643da112707SJunchao Zhang   PetscFunctionReturn(0);
1644da112707SJunchao Zhang }
1645da112707SJunchao Zhang 
1646d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
1647d71ae5a4SJacob Faibussowitsch {
1648da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1649da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1650da112707SJunchao Zhang   const PetscScalar            *barray;
1651da112707SJunchao Zhang   PetscScalar                  *xarray;
1652da112707SJunchao Zhang 
1653da112707SJunchao Zhang   PetscFunctionBegin;
1654da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayWrite(x, &xarray));
1655da112707SJunchao Zhang   PetscCall(VecCUDAGetArrayRead(b, &barray));
1656da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
1657da112707SJunchao Zhang 
1658da112707SJunchao Zhang   /* Solve L*y = b */
1659da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
1660da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
16619371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
16629371c9d4SSatish Balay                                        fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));
1663da112707SJunchao Zhang 
1664da112707SJunchao Zhang   /* Solve Lt*x = y */
1665da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
16669371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
16679371c9d4SSatish Balay                                        fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));
1668da112707SJunchao Zhang 
1669da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayRead(b, &barray));
1670da112707SJunchao Zhang   PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
1671da112707SJunchao Zhang 
1672da112707SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
1673da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
1674da112707SJunchao Zhang   PetscFunctionReturn(0);
1675da112707SJunchao Zhang }
1676da112707SJunchao Zhang 
1677d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
1678d71ae5a4SJacob Faibussowitsch {
1679da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1680da112707SJunchao Zhang   Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
1681da112707SJunchao Zhang   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
1682da112707SJunchao Zhang   CsrMatrix                    *Acsr;
1683da112707SJunchao Zhang   PetscInt                      m, nz;
1684da112707SJunchao Zhang   PetscBool                     flg;
1685da112707SJunchao Zhang 
1686da112707SJunchao Zhang   PetscFunctionBegin;
1687da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1688da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1689da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1690da112707SJunchao Zhang   }
1691da112707SJunchao Zhang 
1692da112707SJunchao Zhang   /* Copy A's value to fact */
1693da112707SJunchao Zhang   m  = fact->rmap->n;
1694da112707SJunchao Zhang   nz = aij->nz;
1695da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1696da112707SJunchao Zhang   Acsr = (CsrMatrix *)Acusp->mat->mat;
1697da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1698da112707SJunchao Zhang 
1699da112707SJunchao Zhang   /* Factorize fact inplace */
1700da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
1701da112707SJunchao Zhang      Function csric02() only takes the lower triangular part of matrix A to perform factorization.
1702da112707SJunchao Zhang      The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
1703da112707SJunchao Zhang      and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
1704da112707SJunchao Zhang      In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
1705da112707SJunchao Zhang    */
17069371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1707da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1708da112707SJunchao Zhang     int              numerical_zero;
1709da112707SJunchao Zhang     cusparseStatus_t status;
1710da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
1711da112707SJunchao Zhang     PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
1712da112707SJunchao Zhang   }
1713da112707SJunchao Zhang 
17149371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));
1715da112707SJunchao Zhang 
1716da112707SJunchao Zhang   /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
1717da112707SJunchao Zhang     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
1718da112707SJunchao Zhang   */
17199371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
1720da112707SJunchao Zhang 
1721da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_GPU;
1722da112707SJunchao Zhang   fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
1723da112707SJunchao Zhang   fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
1724da112707SJunchao Zhang   fact->ops->matsolve          = NULL;
1725da112707SJunchao Zhang   fact->ops->matsolvetranspose = NULL;
1726da112707SJunchao Zhang   PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
1727da112707SJunchao Zhang   PetscFunctionReturn(0);
1728da112707SJunchao Zhang }
1729da112707SJunchao Zhang 
1730d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1731d71ae5a4SJacob Faibussowitsch {
1732da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1733da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1734da112707SJunchao Zhang   PetscInt                      m, nz;
1735da112707SJunchao Zhang 
1736da112707SJunchao Zhang   PetscFunctionBegin;
1737da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1738da112707SJunchao Zhang     PetscInt  i;
1739da112707SJunchao Zhang     PetscBool flg, missing;
1740da112707SJunchao Zhang 
1741da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1742da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1743da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1744da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1745da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1746da112707SJunchao Zhang   }
1747da112707SJunchao Zhang 
1748da112707SJunchao Zhang   /* Free the old stale stuff */
1749da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1750da112707SJunchao Zhang 
1751da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1752da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1753da112707SJunchao Zhang    */
1754da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1755da112707SJunchao Zhang 
1756da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1757da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
1758da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1759da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1760da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1761da112707SJunchao Zhang 
1762da112707SJunchao Zhang   aij->row = NULL;
1763da112707SJunchao Zhang   aij->col = NULL;
1764da112707SJunchao Zhang 
1765da112707SJunchao Zhang   /* ====================================================================== */
1766da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1767da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1768da112707SJunchao Zhang   /* ====================================================================== */
1769da112707SJunchao Zhang   const int *Ai, *Aj;
1770da112707SJunchao Zhang 
1771da112707SJunchao Zhang   m  = fact->rmap->n;
1772da112707SJunchao Zhang   nz = aij->nz;
1773da112707SJunchao Zhang 
1774da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1775da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1776da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1777da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1778da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1779da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1780da112707SJunchao Zhang 
1781da112707SJunchao Zhang   /* ====================================================================== */
1782da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
1783da112707SJunchao Zhang   /* ====================================================================== */
1784da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1785da112707SJunchao Zhang   cusparseDiagType_t diagType;
1786da112707SJunchao Zhang 
1787da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1788da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1789da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1790da112707SJunchao Zhang 
1791da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1792da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1793da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1794da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1795da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1796da112707SJunchao Zhang   */
1797da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1798da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
17999371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
18009371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
18019371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1802da112707SJunchao Zhang 
1803da112707SJunchao Zhang   /* ========================================================================= */
1804da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1805da112707SJunchao Zhang   /* ========================================================================= */
1806da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
18079371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1808da112707SJunchao Zhang 
1809da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1810da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1811da112707SJunchao Zhang 
1812da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1813da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1814da112707SJunchao Zhang 
1815da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
18169371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1817da112707SJunchao Zhang 
1818da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
18199371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1820da112707SJunchao Zhang 
182112ba2bc6SJunchao Zhang   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
182212ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
182312ba2bc6SJunchao Zhang    */
182412ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
182512ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
182612ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1827da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
182812ba2bc6SJunchao Zhang   } else {
182912ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
183012ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
183112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
183212ba2bc6SJunchao Zhang   }
1833da112707SJunchao Zhang 
1834da112707SJunchao Zhang   /* ========================================================================== */
1835da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
1836da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
1837da112707SJunchao Zhang   /* ========================================================================== */
1838da112707SJunchao Zhang   int              structural_zero;
1839da112707SJunchao Zhang   cusparseStatus_t status;
1840da112707SJunchao Zhang 
1841da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
18429371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1843da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1844da112707SJunchao Zhang     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1845da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1846da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1847da112707SJunchao Zhang   }
1848da112707SJunchao Zhang 
1849da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
18500dd8c0acSJunchao Zhang   {
1851da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
18520dd8c0acSJunchao Zhang     PetscInt      *Ai, nzRow, nzLeft;
1853da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1854da112707SJunchao Zhang 
1855da112707SJunchao Zhang     Ai = Aseq->i;
1856da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1857da112707SJunchao Zhang       nzRow = Ai[i + 1] - Ai[i];
1858da112707SJunchao Zhang       if (nzRow > 1) {
1859da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1860da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1861da112707SJunchao Zhang         */
1862da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1863da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1864da112707SJunchao Zhang       }
1865da112707SJunchao Zhang     }
1866da112707SJunchao Zhang     fs->numericFactFlops = flops;
18670dd8c0acSJunchao Zhang   }
1868da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1869da112707SJunchao Zhang   PetscFunctionReturn(0);
1870da112707SJunchao Zhang }
1871da112707SJunchao Zhang #endif
1872da112707SJunchao Zhang 
1873d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1874d71ae5a4SJacob Faibussowitsch {
1875da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1876da112707SJunchao Zhang 
1877da112707SJunchao Zhang   PetscFunctionBegin;
1878da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1879bc996fdcSJunchao Zhang   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
1880bc996fdcSJunchao Zhang   if (cusparseTriFactors->factorizeOnDevice) {
1881da112707SJunchao Zhang     PetscCall(ISIdentity(isrow, &row_identity));
1882da112707SJunchao Zhang     PetscCall(ISIdentity(iscol, &col_identity));
1883bc996fdcSJunchao Zhang   }
1884da112707SJunchao Zhang   if (!info->levels && row_identity && col_identity) {
1885da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
1886da112707SJunchao Zhang   } else
1887da112707SJunchao Zhang #endif
1888da112707SJunchao Zhang   {
1889da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1890da112707SJunchao Zhang     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1891da112707SJunchao Zhang     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1892da112707SJunchao Zhang   }
1893da112707SJunchao Zhang   PetscFunctionReturn(0);
1894da112707SJunchao Zhang }
1895da112707SJunchao Zhang 
1896d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
1897d71ae5a4SJacob Faibussowitsch {
1898da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1899da112707SJunchao Zhang 
1900da112707SJunchao Zhang   PetscFunctionBegin;
1901da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1902da112707SJunchao Zhang   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
1903da112707SJunchao Zhang   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
1904da112707SJunchao Zhang   PetscFunctionReturn(0);
1905da112707SJunchao Zhang }
1906da112707SJunchao Zhang 
1907d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
1908d71ae5a4SJacob Faibussowitsch {
1909da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;
1910da112707SJunchao Zhang 
1911da112707SJunchao Zhang   PetscFunctionBegin;
1912da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1913bc996fdcSJunchao Zhang   PetscBool perm_identity = PETSC_FALSE;
1914bc996fdcSJunchao Zhang   if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
1915da112707SJunchao Zhang   if (!info->levels && perm_identity) {
1916da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
1917da112707SJunchao Zhang   } else
1918da112707SJunchao Zhang #endif
1919da112707SJunchao Zhang   {
1920da112707SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
1921da112707SJunchao Zhang     PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
1922da112707SJunchao Zhang     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
1923da112707SJunchao Zhang   }
1924da112707SJunchao Zhang   PetscFunctionReturn(0);
1925da112707SJunchao Zhang }
1926da112707SJunchao Zhang 
/* Symbolic Cholesky factorization for SEQAIJCUSPARSE matrices.

   The symbolic phase runs on the host via the SeqAIJ implementation; the
   numeric phase is redirected to the CUSPARSE variant. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Drop any stale device factors before building new symbolic data */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
1937da112707SJunchao Zhang 
/* Reports the solver type (MATSOLVERCUSPARSE) that produced this factored matrix */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1944841d4cb1SJunchao Zhang 
1945841d4cb1SJunchao Zhang /*MC
1946841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
194711a5261eSBarry Smith   on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
1949841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
195011a5261eSBarry Smith   CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
1951841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
1952841d4cb1SJunchao Zhang 
1953841d4cb1SJunchao Zhang   Level: beginner
1954841d4cb1SJunchao Zhang 
195511a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
1956841d4cb1SJunchao Zhang M*/
1957841d4cb1SJunchao Zhang 
/* Factory routine for MATSOLVERCUSPARSE factored matrices.

   Creates a square SEQAIJCUSPARSE matrix B of the same local size as A,
   queries -mat_factor_bind_factorization to decide whether factorization
   should happen on the host or the device, and installs the appropriate
   symbolic-factorization function pointers for the requested factor type
   (LU/ILU/ILUDT or Cholesky/ICC).  If A is bound to the CPU the plain
   SeqAIJ symbolic routines are installed instead. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  /* MatSetType allocates (*B)->spptr as a Mat_SeqAIJCUSPARSETriFactors for factored matrices */
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Use the factor prefix if set, otherwise inherit the options prefix of A */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  /* Only "host" and "device" (case-insensitive) are accepted */
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      /* CPU-bound: fall back to the plain host SeqAIJ symbolic routines */
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2010841d4cb1SJunchao Zhang 
/* Copies the matrix values from the device back to the host CSR arrays.

   Only acts when the values live exclusively on the GPU
   (offloadmask == PETSC_OFFLOAD_GPU); afterwards the mask becomes
   PETSC_OFFLOAD_BOTH.  Unfactored matrices copy from the cuSPARSE CSR
   structure; factored matrices are supported only when the device factor
   values (fs->csrVal) are available (cuSPARSE >= 13500 path). */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  /* For factored matrices spptr actually holds the TriFactors struct */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
20407e8381f9SStefano Zampini 
/* Returns the host values array for read/write access, syncing from the GPU first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  /* Make sure the host copy of the values is current before handing it out */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = a->a;
  PetscFunctionReturn(0);
}
204867a45760SJunchao Zhang 
/* Ends read/write host access: the host copy may have changed, so the device copy is invalidated */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* force re-upload on next device use */
  PetscFunctionReturn(0);
}
205667a45760SJunchao Zhang 
/* Returns the host values array for read-only access, syncing from the GPU first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = a->a;
  PetscFunctionReturn(0);
}
206467a45760SJunchao Zhang 
/* Ends read-only host access; values were not modified so the offload mask is untouched */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}
207167a45760SJunchao Zhang 
/* Returns the host values array for write-only access; no device-to-host sync is
   performed since the caller is expected to overwrite the contents */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = a->a;
  PetscFunctionReturn(0);
}
207867a45760SJunchao Zhang 
/* Ends write-only host access: the host copy was (re)written, so the device copy is invalidated */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU; /* force re-upload on next device use */
  PetscFunctionReturn(0);
}
20867e8381f9SStefano Zampini 
/* Exposes the device CSR arrays (row offsets, column indices, values) of an
   unfactored SEQAIJCUSPARSE matrix, together with the memory type (CUDA).

   Any of i/j/a/mtype may be NULL if the caller is not interested.  The row
   and column index arrays are 32-bit on the device (THRUSTINTARRAY32), so
   this is unsupported with 64-bit PetscInt. */
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Ensure the device CSR structure exists and is up to date */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
21177ee59b9bSJunchao Zhang 
/* Copies (or builds) the device representation of a SEQAIJCUSPARSE matrix.

   Acts only when the host copy is newer than the device copy
   (PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU).  Two paths:
     - values-only update: the nonzero pattern is unchanged and the storage
       format is CSR, so only the values array is re-uploaded;
     - full rebuild: the old device structures are destroyed and the CSR
       (or, pre CUDA-11, ELL/HYB) matrix is rebuilt from the host arrays,
       honoring compressed-row storage when in use.
   On success the offload mask becomes PETSC_OFFLOAD_BOTH unless no host
   values array exists yet (then the upload is structure-only). */
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Values changed: any cached transpose holds stale values */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Nonzero pattern (or format) changed: tear everything down and rebuild */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          /* Compressed-row storage: only rows with nonzeros are represented */
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        if (!a->a) {
          /* No host values yet: upload structure only; do not claim a valid value copy */
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Pre CUDA-11 path: build a temporary CSR and convert it to HYB/ELL */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* The temporary CSR is no longer needed once converted */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
22679ae82921SPaul Mullowney 
/* Thrust functor for zipped iterators: accumulates the first tuple element
   into the second, i.e. get<1>(t) += get<0>(t) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
2275aa372e3fSPaul Mullowney 
/* Thrust functor for zipped iterators: copies the first tuple element into the second */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
22837e8381f9SStefano Zampini 
/* Thrust functor for zipped iterators: copies the second tuple element into the first */
struct VecCUDAEqualsReverse {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2291e6e9a74fSStefano Zampini 
/* Scratch data carried between the symbolic and numeric phases of
   sparse-times-dense and sparse-times-sparse products on the GPU. */
struct MatMatCusparse {
  PetscBool      cisdense; /* assumption: records whether the product C is dense -- TODO confirm against callers */
  PetscScalar   *Bt;       /* device buffer (freed with cudaFree in MatDestroy_MatMatCusparse) */
  Mat            X;        /* auxiliary matrix owned by this structure */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix     *Bcsr;

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t matSpBDescr;
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* Extra SpGEMM workspace buffers introduced by the CUDA 11.4 SpGEMM API */
  void *dBuffer4;
  void *dBuffer5;
  #endif
  size_t                mmBufferSize;
  void                 *mmBuffer;
  void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2316ccdfe979SStefano Zampini 
/* Destructor for MatMatCusparse product data: releases all device buffers,
   cuSPARSE descriptors, and the auxiliary matrix, then frees the struct.
   NULL descriptor/buffer members are skipped, so it is safe on partially
   initialized data. */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mmdata = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mmdata->Bt)); /* cudaFree(NULL) is a no-op */
  delete mmdata->Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
  if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
  if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
  if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
  if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
  #endif
  if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
  if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
#endif
  PetscCall(MatDestroy(&mmdata->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2340ccdfe979SStefano Zampini 
2341ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2342ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a product with a sparse
  (SEQAIJCUSPARSE) matrix A and a dense (SEQDENSECUDA) matrix B.

  Supports MATPRODUCT_AB, AtB, ABt, PtAP and RARt. The sparse-times-dense multiply runs on
  the GPU via cusparseSpMM (CUDA >= 11) or cusparse_csr_spmm (older CUDA). For PtAP/RARt the
  sparse-dense part is computed into the intermediate dense matrix mmdata->X, and the final
  dense-dense product is delegated to MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private().
  The product data (mmdata) was allocated by the matching symbolic routine.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  PetscInt                      m, n, blda, clda; /* op(A) is m x ?, result is m x n; blda/clda are leading dims of B and of C (or X) */
  PetscBool                     flg, biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t              stat;
  cusparseOperation_t           opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat; /* either cusp->mat or cusp->matTranspose, depending on product type */
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  mmdata = (MatMatCusparse *)product->data;
  A      = product->A;
  B      = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  /* pick the sparse operand (A or its stored transpose), the cuSPARSE op, and the result sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP: /* PtAP first computes X = A*P here; P^t*X is done at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      /* let cuSPARSE transpose A on the fly */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      /* use the stored explicit transpose of A */
      PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt: /* RARt first computes X = A*R^t here; R*X is done at the end */
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
  PetscCall(MatDenseCUDAGetArrayRead(B, &barray));

  PetscCall(MatDenseGetLDA(B, &blda));
  /* PtAP/RARt write into the intermediate X; the other products write directly into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
    PetscCall(MatDenseGetLDA(mmdata->X, &clda));
  } else {
    PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
    PetscCall(MatDenseGetLDA(C, &clda));
  }

  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    /* the cached dense descriptors bake in the leading dimension: rebuild the ones whose LDA changed */
    if (mmdata->initialized && mmdata->Blda != blda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
      mmdata->matBDescr = NULL;
    }
    if (!mmdata->matBDescr) {
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    }
    /* NOTE(review): a descriptor that survives this branch (its LDA unchanged while the other's
       changed) keeps the data pointer it was created with; presumably barray/carray are stable
       across such calls, since only the else-branch below refreshes the pointers - TODO confirm */

    if (mmdata->initialized && mmdata->Clda != clda) {
      PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
      mmdata->matCDescr = NULL;
    }
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    }

    /* lazily create the sparse descriptor for A's CSR data */
    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    }
    /* query the workspace size and grow the cached buffer only when needed */
    stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
    PetscCallCUSPARSE(stat);
    if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
      PetscCallCUDA(cudaFree(mmdata->mmBuffer));
      PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
    PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
#else
  PetscInt k; /* inner dimension: op(A) is m x k, op(B) is k x n */
  /* cusparseXcsrmm does not support transpose on B, so build B^t explicitly with cuBLAS */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    /* out-of-place transpose of B into mmdata->Bt (allocated in the symbolic phase) */
    cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
    PetscCallCUBLAS(cerr);
    blda = B->cmap->n;
    k = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); /* 2 flops (mul+add) per nonzero per dense column */
  PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
  /* PtAP/RARt: finish with the dense-dense product involving X; the PetscBool arguments
     presumably select transposition of the first factor - TODO confirm against the callee */
  if (product->type == MATPRODUCT_RARt) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
    PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
  } else {
    PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
  }
  /* restore host types if the caller passed CPU matrices (recorded in the symbolic phase) */
  if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
  if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
2500ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for products of a
  sparse (SEQAIJCUSPARSE) matrix A with a dense matrix B.

  Sets the sizes of C, converts it to MATSEQDENSECUDA (remembering whether the caller
  asked for a host MATSEQDENSE result so the numeric phase can convert back), allocates
  the MatMatCusparse product data, and installs the numeric callback.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                 A, B;
  PetscInt            nrows, ncols;
  PetscBool           wasdense, isaijcusp;
  MatMatCusparse     *mm;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &isaijcusp));
  PetscCheck(isaijcusp, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  /* result dimensions for each supported product type */
  if (product->type == MATPRODUCT_AB) {
    nrows = A->rmap->n;
    ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    nrows = A->cmap->n;
    ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    nrows = A->rmap->n;
    ncols = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    nrows = B->cmap->n;
    ncols = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    nrows = B->rmap->n;
    ncols = B->rmap->n;
  } else {
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C, nrows, ncols, nrows, ncols));
  /* if C was requested as MATSEQDENSE (CPU), compute on the GPU and copy back to the CPU in the numeric phase */
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &wasdense));
  PetscCall(MatSetType(C, MATSEQDENSECUDA));

  /* product data handed over to the numeric phase */
  PetscCall(PetscNew(&mm));
  mm->cisdense = wasdense;
#if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
  /* cusparseXcsrmm cannot transpose B, so reserve device storage for an explicit B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mm->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
#endif
  /* PtAP and RARt need an intermediate dense matrix X holding the sparse-dense factor */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mm->X));
    PetscCall(MatSetType(mm->X, MATSEQDENSECUDA));
    /* do not preallocate: the first MatDenseCUDAGetArray call will allocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      PetscCall(MatSetSizes(mm->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
    } else {
      PetscCall(MatSetSizes(mm->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
    }
  }
  C->product->data    = mm;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2571ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse
  product C = A*B (MATPRODUCT_AB, AtB, ABt) with all operands of type SEQAIJCUSPARSE.

  Runs cusparseSpGEMMreuse_compute (CUDA >= 11.4), cusparseSpGEMM_compute + _copy
  (CUDA 11.0-11.3) or cusparse_csr_spgemm (older CUDA) using the descriptors and
  workspaces set up by the matching symbolic routine; then performs an abbreviated
  MatAssemblyEnd on C. Transposed operands are handled via the stored explicit
  transposes, since the cuSPARSE SpGEMM path only supports non-transposed inputs.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                           A, B;
  Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
  PetscBool                     flg;
  cusparseStatus_t              stat;
  MatProductType                ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse *)C->product->data;
  A      = product->A;
  B      = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize; /* values already on the GPU, only the assembly bookkeeping remains */
  }
  if (!c->nz) goto finalize; /* empty product: nothing to compute */
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  /* CopyToGpu would silently skip matrices bound to CPU; raise instead of computing with stale data */
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* if symbolic exploited symmetry (AtB with A symmetric, ABt with B symmetric), compute plain AB;
     the symbolic phase must have recorded that it made the same substitution */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposed operands are realized via the stored explicit transposes (opA/opB stay non-transpose) */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* structure was fixed in symbolic; only recompute the values */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* older SpGEMM API: compute into the workspace, then copy values into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* legacy API writes directly into C's preallocated CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops)); /* flop count was computed in the symbolic phase */
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  C->offloadmask = PETSC_OFFLOAD_GPU; /* GPU copy is now the authoritative one */
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2692fcdce8c4SStefano Zampini 
2693d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2694d71ae5a4SJacob Faibussowitsch {
2695fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2696fcdce8c4SStefano Zampini   Mat                           A, B;
2697fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2698fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2699fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2700fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2701fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
2702fcdce8c4SStefano Zampini   PetscBool                     flg;
2703fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2704fcdce8c4SStefano Zampini   MatProductType                ptype;
2705fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2706fcdce8c4SStefano Zampini   PetscLogDouble                flops;
2707fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
2708fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2709fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2710fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2711fcdce8c4SStefano Zampini #else
2712fcdce8c4SStefano Zampini   int cnz;
2713fcdce8c4SStefano Zampini #endif
2714b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2715fcdce8c4SStefano Zampini 
2716fcdce8c4SStefano Zampini   PetscFunctionBegin;
2717fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
271828b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2719fcdce8c4SStefano Zampini   A = product->A;
2720fcdce8c4SStefano Zampini   B = product->B;
27219566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
272228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
27239566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
272428b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2725fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
2726fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
2727fcdce8c4SStefano Zampini   /* product data */
27289566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2729fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2730fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2731fcdce8c4SStefano Zampini 
27329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
27339566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2734d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2735d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
273608401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
273708401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2738d60bce21SJunchao Zhang 
2739fcdce8c4SStefano Zampini   ptype = product->type;
2740b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2741fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2742fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2743fa046f9fSJunchao Zhang   }
2744b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2745fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2746fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2747fa046f9fSJunchao Zhang   }
2748fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2749fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2750fcdce8c4SStefano Zampini   switch (ptype) {
2751fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2752fcdce8c4SStefano Zampini     m    = A->rmap->n;
2753fcdce8c4SStefano Zampini     n    = B->cmap->n;
2754fcdce8c4SStefano Zampini     k    = A->cmap->n;
2755fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2756fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2757fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2758fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2759fcdce8c4SStefano Zampini     break;
2760fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2761fcdce8c4SStefano Zampini     m = A->cmap->n;
2762fcdce8c4SStefano Zampini     n = B->cmap->n;
2763fcdce8c4SStefano Zampini     k = A->rmap->n;
27649566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2765fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2766fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2767fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2768fcdce8c4SStefano Zampini     break;
2769fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2770fcdce8c4SStefano Zampini     m = A->rmap->n;
2771fcdce8c4SStefano Zampini     n = B->rmap->n;
2772fcdce8c4SStefano Zampini     k = A->cmap->n;
27739566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2774fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2775fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2776fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2777fcdce8c4SStefano Zampini     break;
2778d71ae5a4SJacob Faibussowitsch   default:
2779d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2780fcdce8c4SStefano Zampini   }
2781fcdce8c4SStefano Zampini 
2782fcdce8c4SStefano Zampini   /* create cusparse matrix */
27839566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
27849566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2785fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
2786fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2787fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2788fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2789fcdce8c4SStefano Zampini 
2790fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2791fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2792fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
27939566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
27949566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2795fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2796fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2797fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2798fcdce8c4SStefano Zampini   } else {
2799fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2800fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2801fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2802fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2803fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2804fcdce8c4SStefano Zampini   }
2805fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2806fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
2807fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
2808fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2809fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2810fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
28119566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
28129566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
28139566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
28149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
28159566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
28169566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
28179566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28189566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
28199566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2820fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2821fcdce8c4SStefano Zampini     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2822fcdce8c4SStefano Zampini     c->nz                = 0;
2823fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2824fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
2825fcdce8c4SStefano Zampini     goto finalizesym;
2826fcdce8c4SStefano Zampini   }
2827fcdce8c4SStefano Zampini 
282828b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
282928b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2830fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2831fcdce8c4SStefano Zampini   if (!biscompressed) {
2832fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
2833fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2834fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2835fcdce8c4SStefano Zampini #endif
2836fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2837fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2838fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
2839fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2840fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2841fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2842fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2843fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2844fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2845fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2846fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
28479566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2848fcdce8c4SStefano Zampini     }
2849fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2850fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
2851fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2852fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
28539371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28549371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2855fcdce8c4SStefano Zampini     }
2856fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2857fcdce8c4SStefano Zampini #endif
2858fcdce8c4SStefano Zampini   }
285928b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
286028b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2861fcdce8c4SStefano Zampini   /* precompute flops count */
2862fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2863fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2864fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2865fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
2866fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
2867fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2868fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2869fcdce8c4SStefano Zampini       }
2870fcdce8c4SStefano Zampini     }
2871fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2872fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2873fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
2874fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2875fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
2876fcdce8c4SStefano Zampini     }
2877fcdce8c4SStefano Zampini   } else { /* TODO */
2878fcdce8c4SStefano Zampini     flops = 0.;
2879fcdce8c4SStefano Zampini   }
2880fcdce8c4SStefano Zampini 
2881fcdce8c4SStefano Zampini   mmdata->flops = flops;
28829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2883b4285af6SJunchao Zhang 
2884fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
28859566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
28869371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
28879371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
28889566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2889b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2890b4285af6SJunchao Zhang   {
2891b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2892b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2893b4285af6SJunchao Zhang   */
2894b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
2895b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
2896b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
2897b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2898b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
2899b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
2900b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
2901b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
2902b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
2903b4285af6SJunchao Zhang 
2904b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2905b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
29069371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
29079371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29089566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
2909b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
29109371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
29119371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2912b4285af6SJunchao Zhang 
2913b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29149371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
29159371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
29179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
29189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
29199371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
29209371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
29229566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
2923b4285af6SJunchao Zhang 
2924b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
2925b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
29269566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2927b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
2928b4285af6SJunchao Zhang     /* allocate matrix C */
29299371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29309371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29319371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
29329371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2933b4285af6SJunchao Zhang     /* update matC with the new pointers */
29349371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29359371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2936b4285af6SJunchao Zhang 
2937b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
29389371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
29399371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
29419371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
29429371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
29449371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
29469566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
2947b4285af6SJunchao Zhang   }
2948ae37ee31SJunchao Zhang   #else
2949b4285af6SJunchao Zhang   size_t bufSize2;
2950fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
29519371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
29529371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29539566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
2954fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
29559371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
29569371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2957fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
29589371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
29599371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2960fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2961fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2962fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2963fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2964fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
29659566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
2966fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
29679371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
29689371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2969fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
29709566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2971fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
29729371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
29739371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
2974fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29759566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2976fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
29779566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
29789371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
29799371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29809371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
29819371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2982ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2983fcdce8c4SStefano Zampini #else
29849566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
29859371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29869371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
29879371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2988fcdce8c4SStefano Zampini   c->nz = cnz;
2989fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
29909566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2991fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
29929566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2993fcdce8c4SStefano Zampini 
29949566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2995fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2996fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2997fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
29989371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
29999371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
30009371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3001fcdce8c4SStefano Zampini #endif
30029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
30039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3004fcdce8c4SStefano Zampini finalizesym:
3005fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3006fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3007fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
30089566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
30099566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3010fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3011fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3012fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3013fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3014fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3015fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3016fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3019fcdce8c4SStefano Zampini   } else {
3020fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3021fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
30229566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
30239566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3024fcdce8c4SStefano Zampini   }
3025fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3026fcdce8c4SStefano Zampini     PetscInt r = 0;
3027fcdce8c4SStefano Zampini     c->i[0]    = 0;
3028fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3029fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3030fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3031fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3032fcdce8c4SStefano Zampini     }
3033fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3034fcdce8c4SStefano Zampini   }
30359566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
30369566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
30379566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3038fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3039fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3040fcdce8c4SStefano Zampini   c->rmax          = 0;
3041fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3042fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3043fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3044fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3045fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3046fcdce8c4SStefano Zampini   }
30479566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
30489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3049fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3050fcdce8c4SStefano Zampini 
3051fcdce8c4SStefano Zampini   C->nonzerostate++;
30529566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
30539566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3054fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3055fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3056fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3057fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3058fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3059abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3060fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3061fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3062fcdce8c4SStefano Zampini   }
3063fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3064fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3065fcdce8c4SStefano Zampini }
3066fcdce8c4SStefano Zampini 
3067fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3068fcdce8c4SStefano Zampini 
3069fcdce8c4SStefano Zampini /* handles sparse or dense B */
3070d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3071d71ae5a4SJacob Faibussowitsch {
3072fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3073fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3074fcdce8c4SStefano Zampini 
3075fcdce8c4SStefano Zampini   PetscFunctionBegin;
3076fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
30779566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
307848a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3079fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3080fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
308148a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3082fcdce8c4SStefano Zampini   }
308365e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
308465e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
308565e4b4d4SStefano Zampini     switch (product->type) {
308665e4b4d4SStefano Zampini     case MATPRODUCT_AB:
308765e4b4d4SStefano Zampini       if (product->api_user) {
3088d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
30899566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3090d0609cedSBarry Smith         PetscOptionsEnd();
309165e4b4d4SStefano Zampini       } else {
3092d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
30939566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3094d0609cedSBarry Smith         PetscOptionsEnd();
309565e4b4d4SStefano Zampini       }
309665e4b4d4SStefano Zampini       break;
309765e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
309865e4b4d4SStefano Zampini       if (product->api_user) {
3099d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
31009566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3101d0609cedSBarry Smith         PetscOptionsEnd();
310265e4b4d4SStefano Zampini       } else {
3103d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
31049566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3105d0609cedSBarry Smith         PetscOptionsEnd();
310665e4b4d4SStefano Zampini       }
310765e4b4d4SStefano Zampini       break;
310865e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
310965e4b4d4SStefano Zampini       if (product->api_user) {
3110d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
31119566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3112d0609cedSBarry Smith         PetscOptionsEnd();
311365e4b4d4SStefano Zampini       } else {
3114d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
31159566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3116d0609cedSBarry Smith         PetscOptionsEnd();
311765e4b4d4SStefano Zampini       }
311865e4b4d4SStefano Zampini       break;
311965e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
312065e4b4d4SStefano Zampini       if (product->api_user) {
3121d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
31229566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3123d0609cedSBarry Smith         PetscOptionsEnd();
312465e4b4d4SStefano Zampini       } else {
3125d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
31269566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3127d0609cedSBarry Smith         PetscOptionsEnd();
312865e4b4d4SStefano Zampini       }
312965e4b4d4SStefano Zampini       break;
313065e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
313165e4b4d4SStefano Zampini       if (product->api_user) {
3132d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
31339566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3134d0609cedSBarry Smith         PetscOptionsEnd();
313565e4b4d4SStefano Zampini       } else {
3136d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
31379566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3138d0609cedSBarry Smith         PetscOptionsEnd();
313965e4b4d4SStefano Zampini       }
314065e4b4d4SStefano Zampini       break;
3141d71ae5a4SJacob Faibussowitsch     default:
3142d71ae5a4SJacob Faibussowitsch       break;
314365e4b4d4SStefano Zampini     }
314465e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
314565e4b4d4SStefano Zampini   }
314665e4b4d4SStefano Zampini   /* dispatch */
3147fcdce8c4SStefano Zampini   if (isdense) {
3148ccdfe979SStefano Zampini     switch (product->type) {
3149ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3150ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3151ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3152ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3153ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3154fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
31559566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3156fcdce8c4SStefano Zampini       } else {
3157fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3158fcdce8c4SStefano Zampini       }
3159fcdce8c4SStefano Zampini       break;
3160d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3161d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3162d71ae5a4SJacob Faibussowitsch       break;
3163d71ae5a4SJacob Faibussowitsch     default:
3164d71ae5a4SJacob Faibussowitsch       break;
3165ccdfe979SStefano Zampini     }
3166fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3167fcdce8c4SStefano Zampini     switch (product->type) {
3168fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3169fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3170d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3171d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3172d71ae5a4SJacob Faibussowitsch       break;
3173fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3174fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3175d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3176d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3177d71ae5a4SJacob Faibussowitsch       break;
3178d71ae5a4SJacob Faibussowitsch     default:
3179d71ae5a4SJacob Faibussowitsch       break;
3180fcdce8c4SStefano Zampini     }
3181fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
31829566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3183fcdce8c4SStefano Zampini   }
3184ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3185ccdfe979SStefano Zampini }
3186ccdfe979SStefano Zampini 
3187d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3188d71ae5a4SJacob Faibussowitsch {
31899ae82921SPaul Mullowney   PetscFunctionBegin;
31909566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3191e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3192e6e9a74fSStefano Zampini }
3193e6e9a74fSStefano Zampini 
3194d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3195d71ae5a4SJacob Faibussowitsch {
3196e6e9a74fSStefano Zampini   PetscFunctionBegin;
31979566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3198e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3199e6e9a74fSStefano Zampini }
3200e6e9a74fSStefano Zampini 
3201d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3202d71ae5a4SJacob Faibussowitsch {
3203e6e9a74fSStefano Zampini   PetscFunctionBegin;
32049566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3205e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3206e6e9a74fSStefano Zampini }
3207e6e9a74fSStefano Zampini 
3208d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3209d71ae5a4SJacob Faibussowitsch {
3210e6e9a74fSStefano Zampini   PetscFunctionBegin;
32119566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
32129ae82921SPaul Mullowney   PetscFunctionReturn(0);
32139ae82921SPaul Mullowney }
32149ae82921SPaul Mullowney 
3215d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3216d71ae5a4SJacob Faibussowitsch {
3217ca45077fSPaul Mullowney   PetscFunctionBegin;
32189566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3219ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3220ca45077fSPaul Mullowney }
3221ca45077fSPaul Mullowney 
3222d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3223d71ae5a4SJacob Faibussowitsch {
3224a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3225a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3226a0e72f99SJunchao Zhang }
3227a0e72f99SJunchao Zhang 
3228afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3229d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3230d71ae5a4SJacob Faibussowitsch {
32319ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3232aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
32339ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3234e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3235e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3236e6e9a74fSStefano Zampini   PetscBool                     compressed;
3237afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3238afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3239afb2bd1cSJunchao Zhang #endif
32406e111a19SKarl Rupp 
32419ae82921SPaul Mullowney   PetscFunctionBegin;
324208401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3243cbc6b225SStefano Zampini   if (!a->nz) {
32449566063dSJacob Faibussowitsch     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
32459566063dSJacob Faibussowitsch     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3246e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
3247e6e9a74fSStefano Zampini   }
324834d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
32499566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3250e6e9a74fSStefano Zampini   if (!trans) {
32519ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
32525f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3253e6e9a74fSStefano Zampini   } else {
32541a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3255e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3256e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3257e6e9a74fSStefano Zampini     } else {
32589566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3259e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3260e6e9a74fSStefano Zampini     }
3261e6e9a74fSStefano Zampini   }
3262e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3263e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3264213423ffSJunchao Zhang 
3265e6e9a74fSStefano Zampini   try {
32669566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
32679566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
32689566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3269afb2bd1cSJunchao Zhang 
32709566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3271e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3272afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3273afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3274afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3275afb2bd1cSJunchao Zhang       */
3276e6e9a74fSStefano Zampini       xptr = xarray;
3277afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3278213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3279afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3280afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3281afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3282afb2bd1cSJunchao Zhang        */
3283afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3284afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3285afb2bd1cSJunchao Zhang         nx             = mat->num_cols;
3286afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3287afb2bd1cSJunchao Zhang       }
3288afb2bd1cSJunchao Zhang #endif
3289e6e9a74fSStefano Zampini     } else {
3290afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3291afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3292afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3293afb2bd1cSJunchao Zhang        */
3294afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3295e6e9a74fSStefano Zampini       dptr = zarray;
3296e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3297afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3298e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3299d0967f54SJacob Faibussowitsch 
3300d0967f54SJacob Faibussowitsch         thrust::for_each(
3301d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3302d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3303d0967f54SJacob Faibussowitsch #endif
3304d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
33059371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3306e6e9a74fSStefano Zampini       }
3307afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3308afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3309afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3310afb2bd1cSJunchao Zhang         nx             = mat->num_rows;
3311afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3312afb2bd1cSJunchao Zhang       }
3313afb2bd1cSJunchao Zhang #endif
3314e6e9a74fSStefano Zampini     }
33159ae82921SPaul Mullowney 
3316afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3317aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3318afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
33195f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3320afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
33219566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
33229566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
33239371c9d4SSatish Balay         PetscCallCUSPARSE(
33249371c9d4SSatish Balay           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
33259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3326afb2bd1cSJunchao Zhang 
3327afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3328afb2bd1cSJunchao Zhang       } else {
3329afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
33309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
33319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3332afb2bd1cSJunchao Zhang       }
3333afb2bd1cSJunchao Zhang 
33349371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
33359371c9d4SSatish Balay                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3336afb2bd1cSJunchao Zhang #else
33377656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
33389371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3339afb2bd1cSJunchao Zhang #endif
3340aa372e3fSPaul Mullowney     } else {
3341213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3342afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3343afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3344afb2bd1cSJunchao Zhang #else
3345301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
33469371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3347afb2bd1cSJunchao Zhang #endif
3348a65300a6SPaul Mullowney       }
3349aa372e3fSPaul Mullowney     }
33509566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3351aa372e3fSPaul Mullowney 
3352e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3353213423ffSJunchao Zhang       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3354213423ffSJunchao Zhang         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
33559566063dSJacob Faibussowitsch           PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
3356e6e9a74fSStefano Zampini         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
33579566063dSJacob Faibussowitsch           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
33587656d835SStefano Zampini         }
3359213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
33609566063dSJacob Faibussowitsch         PetscCall(VecSet_SeqCUDA(zz, 0));
33617656d835SStefano Zampini       }
33627656d835SStefano Zampini 
3363213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3364213423ffSJunchao Zhang       if (compressed) {
33659566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3366a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3367a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3368a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3369a0e72f99SJunchao Zhang          */
3370a0e72f99SJunchao Zhang #if 0
3371a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3372a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3373a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3374e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3375c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3376a0e72f99SJunchao Zhang #else
3377a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3378a0e72f99SJunchao Zhang         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3379a0e72f99SJunchao Zhang #endif
33809566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3381e6e9a74fSStefano Zampini       }
3382e6e9a74fSStefano Zampini     } else {
33839371c9d4SSatish Balay       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3384e6e9a74fSStefano Zampini     }
33859566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
33869566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
33879566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3388d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3389d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3390d71ae5a4SJacob Faibussowitsch   }
3391e6e9a74fSStefano Zampini   if (yy) {
33929566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3393e6e9a74fSStefano Zampini   } else {
33949566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3395e6e9a74fSStefano Zampini   }
33969ae82921SPaul Mullowney   PetscFunctionReturn(0);
33979ae82921SPaul Mullowney }
33989ae82921SPaul Mullowney 
3399d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3400d71ae5a4SJacob Faibussowitsch {
3401ca45077fSPaul Mullowney   PetscFunctionBegin;
34029566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3403ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3404ca45077fSPaul Mullowney }
3405ca45077fSPaul Mullowney 
3406d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3407d71ae5a4SJacob Faibussowitsch {
3408042217e8SBarry Smith   PetscObjectState    onnz = A->nonzerostate;
3409042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
34103fa6b06aSMark Adams 
3411042217e8SBarry Smith   PetscFunctionBegin;
34129566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3413042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
34149566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
34159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3416042217e8SBarry Smith     cusp->deviceMat = NULL;
3417042217e8SBarry Smith   }
34189ae82921SPaul Mullowney   PetscFunctionReturn(0);
34199ae82921SPaul Mullowney }
34209ae82921SPaul Mullowney 
34219ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3422e057df02SPaul Mullowney /*@
342311a5261eSBarry Smith    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3424e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
342511a5261eSBarry Smith    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3426e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3427e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3428e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
34299ae82921SPaul Mullowney 
3430d083f849SBarry Smith    Collective
34319ae82921SPaul Mullowney 
34329ae82921SPaul Mullowney    Input Parameters:
343311a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
34349ae82921SPaul Mullowney .  m - number of rows
34359ae82921SPaul Mullowney .  n - number of columns
34369ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
34379ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
34380298fd71SBarry Smith          (possibly different for each row) or NULL
34399ae82921SPaul Mullowney 
34409ae82921SPaul Mullowney    Output Parameter:
34419ae82921SPaul Mullowney .  A - the matrix
34429ae82921SPaul Mullowney 
344311a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
34449ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
344511a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
34469ae82921SPaul Mullowney 
34479ae82921SPaul Mullowney    Notes:
34489ae82921SPaul Mullowney    If nnz is given then nz is ignored
34499ae82921SPaul Mullowney 
345011a5261eSBarry Smith    The AIJ format, also called
345111a5261eSBarry Smith    compressed row storage, is fully compatible with standard Fortran 77
34529ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
34539ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
34549ae82921SPaul Mullowney 
34559ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
345611a5261eSBarry Smith    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
34579ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
34589ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
34599ae82921SPaul Mullowney 
34609ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
34619ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
34629ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
34639ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
34649ae82921SPaul Mullowney 
34659ae82921SPaul Mullowney    Level: intermediate
34669ae82921SPaul Mullowney 
346711a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
34689ae82921SPaul Mullowney @*/
3469d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3470d71ae5a4SJacob Faibussowitsch {
34719ae82921SPaul Mullowney   PetscFunctionBegin;
34729566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
34739566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
34749566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
34759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
34769ae82921SPaul Mullowney   PetscFunctionReturn(0);
34779ae82921SPaul Mullowney }
34789ae82921SPaul Mullowney 
3479d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
3480d71ae5a4SJacob Faibussowitsch {
34819ae82921SPaul Mullowney   PetscFunctionBegin;
34829ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
34839566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
34849ae82921SPaul Mullowney   } else {
34859566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
3486aa372e3fSPaul Mullowney   }
34879566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
34889566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
34899566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
34909566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
34919566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
34929566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
34939566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
34949566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
34959566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
34969566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
34979566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
34989ae82921SPaul Mullowney   PetscFunctionReturn(0);
34999ae82921SPaul Mullowney }
35009ae82921SPaul Mullowney 
3501ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
350295639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
3503d71ae5a4SJacob Faibussowitsch static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
3504d71ae5a4SJacob Faibussowitsch {
35059ff858a8SKarl Rupp   PetscFunctionBegin;
35069566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
35079566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
35089ff858a8SKarl Rupp   PetscFunctionReturn(0);
35099ff858a8SKarl Rupp }
35109ff858a8SKarl Rupp 
3511d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
3512d71ae5a4SJacob Faibussowitsch {
3513a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
3514039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3515039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3516039c6fbaSStefano Zampini   PetscScalar        *ay;
3517039c6fbaSStefano Zampini   const PetscScalar  *ax;
3518039c6fbaSStefano Zampini   CsrMatrix          *csry, *csrx;
3519e6e9a74fSStefano Zampini 
352095639643SRichard Tran Mills   PetscFunctionBegin;
3521a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
3522a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
3523039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
35249566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
35259566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3526a587d139SMark     PetscFunctionReturn(0);
352795639643SRichard Tran Mills   }
3528039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
35299566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
35309566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
35315f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
35325f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
3533039c6fbaSStefano Zampini   csry = (CsrMatrix *)cy->mat->mat;
3534039c6fbaSStefano Zampini   csrx = (CsrMatrix *)cx->mat->mat;
3535039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3536039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3537039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
3538ad540459SPierre Jolivet     if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
3539039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3540039c6fbaSStefano Zampini   }
3541d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3542d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3543039c6fbaSStefano Zampini 
3544039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3545039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3546039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3547039c6fbaSStefano Zampini     size_t bufferSize;
3548039c6fbaSStefano Zampini     void  *buffer;
3549039c6fbaSStefano Zampini #endif
3550039c6fbaSStefano Zampini 
35519566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
35529566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
35539566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3554039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
35559371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35569371c9d4SSatish Balay                                                      csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
35579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
35589566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35599371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35609371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
35619566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
35629566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
35639566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3564039c6fbaSStefano Zampini #else
35659566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35669371c9d4SSatish Balay     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
35679371c9d4SSatish Balay                                           csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
35689566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
35699566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3570039c6fbaSStefano Zampini #endif
35719566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
35729566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
35739566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
35749566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3575039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3576a587d139SMark     cublasHandle_t cublasv2handle;
3577a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3578039c6fbaSStefano Zampini 
35799566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
35809566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
35819566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
35829566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz, &bnz));
35839566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
35849566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
35859566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * bnz));
35869566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
35879566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
35889566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
35899566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3590039c6fbaSStefano Zampini   } else {
35919566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
35929566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
3593a587d139SMark   }
359495639643SRichard Tran Mills   PetscFunctionReturn(0);
359595639643SRichard Tran Mills }
359695639643SRichard Tran Mills 
/* Scale all stored nonzeros of Y by the scalar a, entirely on the GPU.

   The nonzero values are exposed as a flat device array and scaled with a
   single cuBLAS scal call; the cached diagonal is invalidated afterwards. */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals)); /* device pointer to the nonzero values */
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
361633c9ba73SStefano Zampini 
/* Zero all stored nonzeros of A, keeping the sparsity pattern.

   For an unfactored matrix the device-side CSR values (and those of the cached
   transpose, if present) are filled with zero as well, in which case both host
   and device copies end up valid (PETSC_OFFLOAD_BOTH); otherwise only the host
   copy is zeroed and the device copy is marked stale. */
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij           = (Mat_SeqAIJ *)A->data;
  PetscBool   zeroed_on_gpu = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *csr = (CsrMatrix *)spptr->mat->mat;
      if (csr->values) {
        zeroed_on_gpu = PETSC_TRUE;
        thrust::fill(thrust::device, csr->values->begin(), csr->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix *)spptr->matTranspose->mat;
      if (csrT->values) thrust::fill(thrust::device, csrT->values->begin(), csrT->values->end(), 0.);
    }
  }
  /* always zero the host copy */
  PetscCall(PetscArrayzero(aij->a, aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = zeroed_on_gpu ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
36433fa6b06aSMark Adams 
/* Route the matrix operations of A to either the CPU (SeqAIJ) or GPU (CUSPARSE)
   implementations.

   flg == PETSC_TRUE : copy data back to the host and install the plain SeqAIJ
                       function pointers; the composed-function slots used by
                       GPU-specific products/COO assembly are cleared.
   flg == PETSC_FALSE: install the CUSPARSE function pointers and compose the
                       GPU-specific functions.

   Factored matrices have no CUSPARSE operation table, so for them only the
   boundtocpu flag is recorded. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make sure the host copy is current before the CPU kernels take over */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear the SeqAIJ-inner ops (array accessors etc.) so the defaults apply */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  /* inodes are only enabled while bound to the CPU; NOTE(review): presumably the GPU
     kernels do not use the inode layout -- confirm */
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3707a587d139SMark 
/* Convert a SeqAIJ matrix to MATSEQAIJCUSPARSE.

   For MAT_INITIAL_MATRIX a duplicate is made; for MAT_REUSE_MATRIX the values
   are copied into *newmat; for MAT_INPLACE_MATRIX A itself is retyped. The
   routine then installs the CUSPARSE backing store (a Mat_SeqAIJCUSPARSE for
   regular matrices, a Mat_SeqAIJCUSPARSETriFactors for factored ones), swaps in
   the CUSPARSE operation table via MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE),
   and composes the type-specific functions. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrices carry triangular-factor data instead */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); /* installs the GPU operation table */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
37679ae82921SPaul Mullowney 
/* Registered constructor for MATSEQAIJCUSPARSE: build a plain SeqAIJ matrix and
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
377502fe1965SBarry Smith 
37763ca39a21SBarry Smith /*MC
3777e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3778e057df02SPaul Mullowney 
377911a5261eSBarry Smith    A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
378011a5261eSBarry Smith    CSR, ELL, or Hybrid format.
378111a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3782e057df02SPaul Mullowney 
3783e057df02SPaul Mullowney    Options Database Keys:
378411a5261eSBarry Smith +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
378511a5261eSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
378611a5261eSBarry Smith .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
378711a5261eSBarry Smith -  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3788e057df02SPaul Mullowney 
3789e057df02SPaul Mullowney   Level: beginner
3790e057df02SPaul Mullowney 
379111a5261eSBarry Smith .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3792e057df02SPaul Mullowney M*/
37937f756511SDominic Meiser 
3794bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
37950f39cd5aSBarry Smith 
/* Register the CUSPARSE-based solver packages with PETSc's solver-type registry:
   the banded LU solver (MATSOLVERCUSPARSEBAND, for SeqAIJ matrices) and the
   LU/Cholesky/ILU/ICC factorizations of MATSOLVERCUSPARSE for SeqAIJCUSPARSE. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
380729b38603SBarry Smith 
/* Release all COO-assembly state held by the CUSPARSE backing store of mat.

   Frees the permutation vectors used by the plain COO path and, when the
   extended COO path was used, the device arrays jmap_d/perm_d as well.
   No-op if the matrix has no CUSPARSE data. */
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d
       again whenever they are non-NULL, so stale values here would cause a
       double cudaFree() */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3825cbc6b225SStefano Zampini 
/* Destroy a Mat_SeqAIJCUSPARSE backing store and everything it owns: the GPU
   matrix and its cached transpose, work/permutation vectors, the cuSPARSE
   handle, and any extended-COO device arrays. Safe to call with *cusparsestruct
   == NULL; the struct itself is freed last. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
38447f756511SDominic Meiser 
/* Free a CsrMatrix wrapper together with its three device vectors (values,
   column indices, row offsets) and NULL the caller's pointer. No-op on NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *m = *mat;

    delete m->values;
    delete m->column_indices;
    delete m->row_offsets;
    delete m;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
38577f756511SDominic Meiser 
/* Destroy a triangular-factor struct: its matrix descriptor, csrsv solve info,
   CSR data, device solve buffer, pinned host values, and (CUDA >= 11) the
   csr2csc conversion buffer. Safe to call with *trifactor == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host memory */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
38747f756511SDominic Meiser 
/* Destroy a Mat_SeqAIJCUSPARSEMultStruct holding a matrix in the given storage
   format: the CSR (or, pre-CUDA-11, HYB) matrix, the matrix descriptor, the
   compressed-row index vector, the device-resident scalar constants, and (CUDA
   >= 11) the generic SpMat descriptor plus any initialized SpMV buffers.
   Safe to call with *matstruct == NULL; *matstruct is NULLed on return. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        /* check the return code; previously the error was silently dropped */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are device-resident scalars used by SpMV calls */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
39167f756511SDominic Meiser 
/* Release all factorization data held by a Mat_SeqAIJCUSPARSETriFactors, but
   keep the struct (and its cuSPARSE handle) alive so it can be refilled by a
   new symbolic/numeric factorization. Safe to call with *trifactors == NULL. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    /* legacy csrsv-based lower/upper factors and their transposes */
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    /* band-solver device arrays */
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    /* generic SpSV-based factorization path (cuSPARSE >= 11.5) */
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
3965ccdfe979SStefano Zampini 
/* Fully destroy a Mat_SeqAIJCUSPARSETriFactors: reset its contents, destroy the
   cuSPARSE handle, and free the struct itself. Safe with *trifactors == NULL. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    /* separate assignment and test: "if (handle = ...)" trips -Wparentheses and
       obscures the intent */
    handle = (*trifactors)->handle;
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
39787e8381f9SStefano Zampini 
/* Strict-weak "less than" on (row, col) index tuples: orders by row first, then by column.
   Used to sort COO entries into CSR order. */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt r1 = t1.get<0>(), r2 = t2.get<0>();
    return (r1 < r2) || (r1 == r2 && t1.get<1>() < t2.get<1>());
  }
};
39877e8381f9SStefano Zampini 
/* Equality on (row, col) index tuples: true iff both components match.
   Used to collapse repeated COO entries. */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
39957e8381f9SStefano Zampini 
/* Binary op returning 1 when the two values differ, 0 when they are equal
   (used with adjacent_difference to flag index changes in a sorted sequence). */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 != t2) ? 1 : 0; }
};
39997e8381f9SStefano Zampini 
/* Binary op: logical OR of two flags, returned as a 0/1 PetscInt
   (combines the row-change and column-change indicators). */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
40037e8381f9SStefano Zampini 
40047e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
4005219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Scatter/accumulate COO values v[] into the GPU CSR value array of A, using the sorting
   permutation cusp->cooPerm and (when the COO input had repeated (i,j) entries) the
   unique-nonzero map cusp->cooPerm_a built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().
   v may be a host or device pointer; a NULL v with INSERT_VALUES zeroes the values.
   imode is INSERT_VALUES (overwrite) or ADD_VALUES (accumulate into existing values). */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* temporary device copy of v[] when v is a host pointer */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation information: nothing to scatter, just finish assembly */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else { /* host values: stage them on the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD_VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to sum them up before adding */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>()); /* values[i] += cooPerm_w[i] */
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: straight permuted copy, values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
40827e8381f9SStefano Zampini 
/* Mark the cached transpose of A as out of date. When destroy is PETSC_TRUE, also free
   the cached transpose multiply structure and the csr2csc permutation array used to
   rebuild it. A no-op when A has no CUSPARSE data attached. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4098a49f1ed0SStefano Zampini 
40997e8381f9SStefano Zampini #include <thrust/binary_search.h>
4100219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A (host arrays a->i/a->j plus the GPU copy) from the COO
   indices coo_i[]/coo_j[], which may be host or device pointers but must be nonnegative.
   Also builds cusp->cooPerm (the permutation that sorts the COO input by row then column)
   and, when the input contains repeated (i,j) pairs, cusp->cooPerm_a (for each sorted input
   entry, the index of the unique nonzero it contributes to). Both arrays are consumed later
   by MatSetValuesCOO_SeqAIJCUSPARSE_Basic(). */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0; /* nzr counts rows with at least one nonzero */

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* discard stale permutation data from a previous preallocation of a different size */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) { /* host row indices: copy them into a temporary device buffer */
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted row array */
    THRUSTINTARRAY w(d_j, d_j + n);                                       /* copy of the sorted col array */

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* build the host CSR arrays a->i/a->j and per-row counts from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4235ed502f03SStefano Zampini 
/* Preallocate a MATSEQAIJCUSPARSE matrix from COO input. When the indices are on the
   device, or on the host and all nonnegative, the 'Basic' GPU path is used; otherwise
   (negative indices mark entries to ignore on the host) the extended host-side SeqAIJ
   path runs and its jmap/perm arrays are mirrored to the device for MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool    coo_basic = PETSC_TRUE;
  PetscMemType mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* negative indices can only be detected on the host; any one forces the extended path */
      for (PetscCount k = 0; k < coo_n && coo_basic; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) coo_basic = PETSC_FALSE;
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* mirror the host-built jmap/perm arrays on the device */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4274219fbbafSJunchao Zhang 
/* Kernel: for each of the nnz unique nonzeros, gather its (possibly repeated) COO input
   values kv[] through perm[] over the range [jmap[i], jmap[i+1]) and either overwrite
   (INSERT_VALUES) or accumulate into the CSR value array a[].
   Uses a grid-stride loop, so any 1-D launch configuration covers all entries. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}
4285219fbbafSJunchao Zhang 
/* Set or add COO values v[] into A. Dispatches to the 'Basic' thrust path unless the
   extended COO path was selected at preallocation time, in which case the values are
   staged on the device (if needed) and scattered by the MatAddCOOValues kernel. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  } else {
    const PetscCount   Annz = seq->nz;
    const PetscScalar *d_v  = v;
    PetscScalar       *Aa;
    PetscMemType       memtype;

    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* v[] is on the host: stage a device copy */
      PetscCallCUDA(cudaMalloc((void **)&d_v, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)d_v, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* overwrite: no need to bring old values to device */
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(d_v, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)d_v));
  }
  PetscFunctionReturn(0);
}
4320219fbbafSJunchao Zhang 
43215b7e41feSStefano Zampini /*@C
432211a5261eSBarry Smith     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
43235b7e41feSStefano Zampini 
43245b7e41feSStefano Zampini    Not collective
43255b7e41feSStefano Zampini 
43265b7e41feSStefano Zampini     Input Parameters:
43275b7e41feSStefano Zampini +   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should always be returned in compressed form
43295b7e41feSStefano Zampini 
43305b7e41feSStefano Zampini     Output Parameters:
43315b7e41feSStefano Zampini +   ia - the CSR row pointers
43325b7e41feSStefano Zampini -   ja - the CSR column indices
43335b7e41feSStefano Zampini 
43345b7e41feSStefano Zampini     Level: developer
43355b7e41feSStefano Zampini 
433611a5261eSBarry Smith     Note:
43375b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
43385b7e41feSStefano Zampini 
4339db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
43405b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *aij            = (Mat_SeqAIJ *)A->data;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  if (!i || !j) PetscFunctionReturn(0); /* both output pointers are required */
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes raw row/column index arrays */
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix *)cusparsestruct->mat->mat;
  if (i) {
    if (!compressed && aij->compressedrow.use) { /* the caller wants the full row offsets while storage is compressed */
      if (!cusparsestruct->rowoffsets_gpu) {     /* build and cache the uncompressed offsets on first use */
        cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusparsestruct->rowoffsets_gpu->assign(aij->i, aij->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusparsestruct->rowoffsets_gpu->data().get();
    } else {
      *i = csrmat->row_offsets->data().get();
    }
  }
  if (j) *j = csrmat->column_indices->data().get();
  PetscFunctionReturn(0);
}
43685f101d05SStefano Zampini 
43695b7e41feSStefano Zampini /*@C
437011a5261eSBarry Smith     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
43715b7e41feSStefano Zampini 
43725b7e41feSStefano Zampini    Not collective
43735b7e41feSStefano Zampini 
43745b7e41feSStefano Zampini     Input Parameters:
43755b7e41feSStefano Zampini +   A - the matrix
-   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should always be returned in compressed form
43775b7e41feSStefano Zampini 
43785b7e41feSStefano Zampini     Output Parameters:
43795b7e41feSStefano Zampini +   ia - the CSR row pointers
43805b7e41feSStefano Zampini -   ja - the CSR column indices
43815b7e41feSStefano Zampini 
43825b7e41feSStefano Zampini     Level: developer
43835b7e41feSStefano Zampini 
4384db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()`
43855b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* no bookkeeping needed; just invalidate the caller's borrowed pointers
     (the 'compressed' argument is accepted for symmetry with MatSeqAIJCUSPARSEGetIJ() and is unused) */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
43955f101d05SStefano Zampini 
43965b7e41feSStefano Zampini /*@C
439711a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
43985b7e41feSStefano Zampini 
43995b7e41feSStefano Zampini    Not Collective
44005b7e41feSStefano Zampini 
44015b7e41feSStefano Zampini    Input Parameter:
440211a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44035b7e41feSStefano Zampini 
44045b7e41feSStefano Zampini    Output Parameter:
44055b7e41feSStefano Zampini .   a - pointer to the device data
44065b7e41feSStefano Zampini 
44075b7e41feSStefano Zampini    Level: developer
44085b7e41feSStefano Zampini 
440911a5261eSBarry Smith    Note:
441011a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
44115b7e41feSStefano Zampini 
4412db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
44135b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparse = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* only the CSR storage format exposes a raw device value array */
  PetscCheck(cusparse->format != MAT_CUSPARSE_ELL && cusparse->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the device copy up to date; this may trigger a host-to-device transfer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparse->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  {
    CsrMatrix *csrmat = (CsrMatrix *)cusparse->mat->mat;

    PetscCheck(csrmat->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
    *a = csrmat->values->data().get();
  }
  /* read-only access: the offload mask and any cached transpose remain valid */
  PetscFunctionReturn(0);
}
4431ed502f03SStefano Zampini 
44325b7e41feSStefano Zampini /*@C
443311a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
44345b7e41feSStefano Zampini 
44355b7e41feSStefano Zampini    Not Collective
44365b7e41feSStefano Zampini 
44375b7e41feSStefano Zampini    Input Parameter:
443811a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44395b7e41feSStefano Zampini 
44405b7e41feSStefano Zampini    Output Parameter:
44415b7e41feSStefano Zampini .   a - pointer to the device data
44425b7e41feSStefano Zampini 
44435b7e41feSStefano Zampini    Level: developer
44445b7e41feSStefano Zampini 
4445db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
44465b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* Read-only access: the data cannot have changed, so no diagonal invalidation or
     object-state bump is needed; simply drop the borrowed device pointer. */
  *a = NULL;
  PetscFunctionReturn(0);
}
4456ed502f03SStefano Zampini 
44575b7e41feSStefano Zampini /*@C
445811a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
44595b7e41feSStefano Zampini 
44605b7e41feSStefano Zampini    Not Collective
44615b7e41feSStefano Zampini 
44625b7e41feSStefano Zampini    Input Parameter:
446311a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
44645b7e41feSStefano Zampini 
44655b7e41feSStefano Zampini    Output Parameter:
44665b7e41feSStefano Zampini .   a - pointer to the device data
44675b7e41feSStefano Zampini 
44685b7e41feSStefano Zampini    Level: developer
44695b7e41feSStefano Zampini 
447011a5261eSBarry Smith    Note:
447111a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
44725b7e41feSStefano Zampini 
4473db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
44745b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *device_csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* a raw value array is only available for the CSR storage format */
  PetscCheck(cu->format != MAT_CUSPARSE_ELL && cu->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* may move data from host if the device copy is stale */
  PetscCheck(cu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  device_csr = (CsrMatrix *)cu->mat->mat;
  PetscCheck(device_csr->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = device_csr->values->data().get();
  /* the caller may modify values through this pointer: the device copy becomes the
     authoritative one, and any cached explicit transpose is no longer trustworthy */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
44945b7e41feSStefano Zampini /*@C
449511a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4496039c6fbaSStefano Zampini 
44975b7e41feSStefano Zampini    Not Collective
44985b7e41feSStefano Zampini 
44995b7e41feSStefano Zampini    Input Parameter:
450011a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45015b7e41feSStefano Zampini 
45025b7e41feSStefano Zampini    Output Parameter:
45035b7e41feSStefano Zampini .   a - pointer to the device data
45045b7e41feSStefano Zampini 
45055b7e41feSStefano Zampini    Level: developer
45065b7e41feSStefano Zampini 
4507db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`
45085b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified through the borrowed pointer: drop the cached
     diagonal and bump the object state so dependent objects notice the change */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4520039c6fbaSStefano Zampini 
45215b7e41feSStefano Zampini /*@C
452211a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45235b7e41feSStefano Zampini 
45245b7e41feSStefano Zampini    Not Collective
45255b7e41feSStefano Zampini 
45265b7e41feSStefano Zampini    Input Parameter:
452711a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45285b7e41feSStefano Zampini 
45295b7e41feSStefano Zampini    Output Parameter:
45305b7e41feSStefano Zampini .   a - pointer to the device data
45315b7e41feSStefano Zampini 
45325b7e41feSStefano Zampini    Level: developer
45335b7e41feSStefano Zampini 
453411a5261eSBarry Smith    Note:
453511a5261eSBarry Smith    Does not trigger host-device copies and flags data validity on the GPU
45365b7e41feSStefano Zampini 
4537db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
45385b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *spgpu = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csrdata;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* a raw value array is only available for the CSR storage format */
  PetscCheck(spgpu->format != MAT_CUSPARSE_ELL && spgpu->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: unlike MatSeqAIJCUSPARSEGetArray(), no host-to-device copy is
     triggered, but the device structures must already exist */
  PetscCheck(spgpu->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrdata = (CsrMatrix *)spgpu->mat->mat;
  PetscCheck(csrdata->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = csrdata->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU; /* the device copy becomes the valid one */
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4557ed502f03SStefano Zampini 
45585b7e41feSStefano Zampini /*@C
455911a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
45605b7e41feSStefano Zampini 
45615b7e41feSStefano Zampini    Not Collective
45625b7e41feSStefano Zampini 
45635b7e41feSStefano Zampini    Input Parameter:
456411a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45655b7e41feSStefano Zampini 
45665b7e41feSStefano Zampini    Output Parameter:
45675b7e41feSStefano Zampini .   a - pointer to the device data
45685b7e41feSStefano Zampini 
45695b7e41feSStefano Zampini    Level: developer
45705b7e41feSStefano Zampini 
4571db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
45725b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values were (re)written on the device: invalidate the cached diagonal and
     bump the object state so dependent objects notice the change */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4584ed502f03SStefano Zampini 
/* Strict weak ordering on (row, col, value, perm) tuples: lexicographic by (row, col).
   Used to merge two COO-expanded matrices into row-major sorted order. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int r1 = t1.get<0>(), r2 = t2.get<0>();

    if (r1 != r2) return r1 < r2;          /* different rows: order by row index */
    return t1.get<1>() < t2.get<1>();      /* same row: order by column index */
  }
};
4593ed502f03SStefano Zampini 
/* Unary functor that adds a fixed offset to an integer index (e.g. to shift the
   column indices of B past the columns of A when concatenating two matrices). */
struct Shift {
  int _shift; /* constant offset applied to every index */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return _shift + c; }
};
4600ed502f03SStefano Zampini 
4601ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4602d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4603d71ae5a4SJacob Faibussowitsch {
4604ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4605ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4606ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4607ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4608ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4609ed502f03SStefano Zampini   cusparseStatus_t              stat;
4610ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4611ed502f03SStefano Zampini 
4612ed502f03SStefano Zampini   PetscFunctionBegin;
4613ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4614ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4615ed502f03SStefano Zampini   PetscValidPointer(C, 4);
4616ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4617ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
46185f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
461908401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4620aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4621aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4622ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4623ed502f03SStefano Zampini     m = A->rmap->n;
4624ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
46259566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
46269566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
46279566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4628ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4629ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4630ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4631ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4632ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4633ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4634ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4635ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4636ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4637ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4638ed502f03SStefano Zampini     Ccusp->nrows            = m;
4639ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4640ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4641ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4642ed502f03SStefano Zampini     Ccsr->num_cols          = n;
46439566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
46449566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
46459566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
46469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
46479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
46489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
46499566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46509566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
46529566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46539566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
465428b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
465528b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4656ed502f03SStefano Zampini 
4657ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4658ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4659ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4660ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4661ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4662ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4663ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4664ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4665ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
4666ed502f03SStefano Zampini     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4667ed502f03SStefano Zampini     if (c->nz) {
46682ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
46692ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
46702ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
46712ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
46722ed87e7eSStefano Zampini 
4673ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4674ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4675ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4676ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
46779566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4678ed502f03SStefano Zampini         }
46792ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
46802ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4681ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4682ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4683ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4684ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
46859566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4686ed502f03SStefano Zampini         }
46872ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
46882ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
46899566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
46909371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46919371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46929371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
46939371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
46942ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
46952ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
46962ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
46978909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4698ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4699ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
47008909a122SStefano Zampini #else
47018909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
47028909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
47038909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
47048909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
47058909a122SStefano Zampini #endif
47062ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
47072ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
47082ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
47092ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
47102ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
47112ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4712ed502f03SStefano Zampini       auto p1    = Ccusp->cooPerm->begin();
4713ed502f03SStefano Zampini       auto p2    = Ccusp->cooPerm->begin();
4714ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4715792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
47168909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
47178909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
47188909a122SStefano Zampini #endif
47192ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
47202ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
47212ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4722792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
47232ed87e7eSStefano Zampini #else
47242ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4725792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4726792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
47272ed87e7eSStefano Zampini #endif
47289371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47299371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47309566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
47312ed87e7eSStefano Zampini       delete wPerm;
47322ed87e7eSStefano Zampini       delete Acoo;
47332ed87e7eSStefano Zampini       delete Bcoo;
47342ed87e7eSStefano Zampini       delete Ccoo;
4735ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47369371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47379371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4738ed502f03SStefano Zampini #endif
47391a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
47409566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
47419566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4742ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4743ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4744ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4745ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4746ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4747ed502f03SStefano Zampini 
47481a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
47491a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4750a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4751ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4752ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4753ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4754ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4755ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4756ed502f03SStefano Zampini 
4757ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4758ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4759ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4760ed502f03SStefano Zampini 
47619566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4762ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4763ed502f03SStefano Zampini         if (AT) {
4764ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4765ed502f03SStefano Zampini           thrust::advance(rT, -1);
4766ed502f03SStefano Zampini         }
4767ed502f03SStefano Zampini         if (BT) {
4768ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4769ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4770ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4771ed502f03SStefano Zampini         }
4772ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4773ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4774ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4775ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4776ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4777ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
47789566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4779ed502f03SStefano Zampini 
47809566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
47819566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
47829566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
47839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
47849566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
47859566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
47869566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47879566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47889566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4789ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
47909371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
47919371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4792ed502f03SStefano Zampini #endif
4793ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4794ed502f03SStefano Zampini       }
4795ed502f03SStefano Zampini     }
4796ed502f03SStefano Zampini 
4797ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4798ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4799ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
48009566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
48019566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
4802ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4803ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4804ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4805ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4806ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
48079566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
48089566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4809ed502f03SStefano Zampini     } else {
48109566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
48119566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4812ed502f03SStefano Zampini     }
48139566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
48149566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
48159566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4816ed502f03SStefano Zampini     c->maxnz         = c->nz;
4817ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4818ed502f03SStefano Zampini     c->rmax          = 0;
4819ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4820ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4821ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4822ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4823ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4824ed502f03SStefano Zampini     }
48259566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
48269566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4827ed502f03SStefano Zampini     (*C)->nonzerostate++;
48289566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
48299566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4830ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4831ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4832ed502f03SStefano Zampini   } else {
483308401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4834ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4835ed502f03SStefano Zampini     if (c->nz) {
4836ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
48375f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4838aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
483908401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
48409566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
48419566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
48425f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
48435f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4844ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4845ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4846ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4847aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4848aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4849aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4850aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
48515f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4852ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4853ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
48549566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
48559371c9d4SSatish Balay       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
48569371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4857ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
48589371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
48599371c9d4SSatish Balay       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4860ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
48619566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
48621a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
48635f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4864ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4865ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4866ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4867ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4868ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4869ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4870ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48711a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4872ed502f03SStefano Zampini       }
48739566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4874ed502f03SStefano Zampini     }
4875ed502f03SStefano Zampini   }
48769566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4877ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4878ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4879ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4880ed502f03SStefano Zampini   PetscFunctionReturn(0);
4881ed502f03SStefano Zampini }
4882c215019aSStefano Zampini 
4883d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4884d71ae5a4SJacob Faibussowitsch {
4885c215019aSStefano Zampini   bool               dmem;
4886c215019aSStefano Zampini   const PetscScalar *av;
4887c215019aSStefano Zampini 
4888c215019aSStefano Zampini   PetscFunctionBegin;
4889c215019aSStefano Zampini   dmem = isCudaMem(v);
48909566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
4891c215019aSStefano Zampini   if (n && idx) {
4892c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4893c215019aSStefano Zampini     widx.assign(idx, idx + n);
48949566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
4895c215019aSStefano Zampini 
4896c215019aSStefano Zampini     THRUSTARRAY                    *w = NULL;
4897c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
4898c215019aSStefano Zampini     if (dmem) {
4899c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
4900c215019aSStefano Zampini     } else {
4901c215019aSStefano Zampini       w  = new THRUSTARRAY(n);
4902c215019aSStefano Zampini       dv = w->data();
4903c215019aSStefano Zampini     }
4904c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4905c215019aSStefano Zampini 
4906c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
4907c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
4908c215019aSStefano Zampini     thrust::for_each(zibit, zieit, VecCUDAEquals());
490948a46eb9SPierre Jolivet     if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
4910c215019aSStefano Zampini     delete w;
4911c215019aSStefano Zampini   } else {
49129566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
4913c215019aSStefano Zampini   }
49149566063dSJacob Faibussowitsch   if (!dmem) PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
49159566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
4916c215019aSStefano Zampini   PetscFunctionReturn(0);
4917c215019aSStefano Zampini }
4918