xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16d0967f54SJacob Faibussowitsch #if PETSC_CPP_VERSION >= 14
17d0967f54SJacob Faibussowitsch   #define PETSC_HAVE_THRUST_ASYNC 1
18d0967f54SJacob Faibussowitsch   // thrust::for_each(thrust::cuda::par.on()) requires C++14
19a0e72f99SJunchao Zhang   #include <thrust/async/for_each.h>
20d0967f54SJacob Faibussowitsch #endif
21a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
22a2cee5feSJed Brown #include <thrust/remove.h>
23a2cee5feSJed Brown #include <thrust/sort.h>
24a2cee5feSJed Brown #include <thrust/unique.h>
25e8d2b73aSMark Adams 
/* Names for the cuSPARSE storage formats in 0-based enum order; the trailing entries
   ("MatCUSPARSEStorageFormat", "MAT_CUSPARSE_") are the enum type name and option prefix
   that PetscOptionsEnum() expects, followed by the mandatory 0 terminator. */
const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0};
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
/* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0};
const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0};
#endif
629ae82921SPaul Mullowney 
63087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
64087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *);
65087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
686fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *);
696fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *);
70087f3262SPaul Mullowney 
716fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
726fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
736fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
746fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
75dbbe0bcdSBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems *PetscOptionsObject);
76a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
7733c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);
786fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
796fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
806fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
816fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
82e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
83e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
84e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);
859ae82921SPaul Mullowney 
867f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **);
87470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **);
88470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat);
89470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **);
90470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **);
917f756511SDominic Meiser 
9257181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
93a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
9457181aedSStefano Zampini 
95c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
96e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]);
97219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
98c215019aSStefano Zampini 
/* Type-specific implementation behind MatCUSPARSESetFormat() for MATSEQAIJCUSPARSE:
   records the requested storage format in the matrix's GPU-side data structure. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: a sequential matrix keeps a single format field, */
  case MAT_CUSPARSE_ALL:  /* so both supported operations store into the same place */
    cusp->format = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
1169ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular
   operation. Only the `MatMult()` operation can use different GPU storage formats

   Not Collective

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
.  op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,
        `MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`.
-  format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.)

   Level: intermediate

.seealso: `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; silently a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format));
  PetscFunctionReturn(0);
}
142e057df02SPaul Mullowney 
/* Type-specific implementation behind MatCUSPARSESetUseCPUSolve() for MATSEQAIJCUSPARSE:
   records whether triangular solves should be performed on the CPU. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
151365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether to use the built-in CPU `MatSolve()`.

   Input Parameters:
+  A - Matrix of type `MATSEQAIJCUSPARSE`
-  use_cpu - set flag for using the built-in CPU `MatSolve()`

   Note:
   The cuSPARSE LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this method to specify if the solve is done on the CPU or GPU (GPU is the default).

   Level: intermediate

.seealso: `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* dispatch to the type-specific implementation; silently a no-op for other matrix types */
  PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu));
  PetscFunctionReturn(0);
}
177365b711fSMark Adams 
/* MatSetOption() implementation for MATSEQAIJCUSPARSE: handles the one option with
   GPU-side consequences itself and defers everything else to the host AIJ handler. */
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
    if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A, op, flg));
  }
  PetscFunctionReturn(0);
}
193e6e9a74fSStefano Zampini 
194bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
195bddcd29dSMark Adams 
/*
  Numeric LU factorization for MATSEQAIJCUSPARSE. The factorization itself is done on the
  CPU by MatLUFactorNumeric_SeqAIJ(); this wrapper then picks the appropriate solve
  routines and, unless CPU solves were requested, moves the triangular factors to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b     = (Mat_SeqAIJ *)B->data;
  IS                  isrow = b->row, iscol = b->col;
  PetscBool           row_identity, col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)B->spptr;

  PetscFunctionBegin;
  /* make sure the host copy of A is up to date before factoring on the CPU */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU; /* the fresh factors live only on the host for now */
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow, &row_identity));
  PetscCall(ISIdentity(iscol, &col_identity));

  /* identity row and column permutations allow the cheaper natural-ordering solves */
  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  B->ops->matsolve          = NULL; /* no multiple-right-hand-side solve is provided here */
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
227bddcd29dSMark Adams 
/*
  Processes the -mat_cusparse_* command-line options for an unfactored MATSEQAIJCUSPARSE
  matrix: storage format for SpMV (and for all operations), CPU vs GPU triangular solve,
  and (CUDA >= 11) the cuSPARSE SpMV/SpMM/csr2csc algorithm choices.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE      *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
    /* PetscOptionsBool() already stored the value; the setter is still called so the type-specific method runs */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg));
    if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg));
    PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(
      PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg));
    PetscCheck(!flg || CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
2639ae82921SPaul Mullowney 
/*
  Builds (first call) or refreshes (subsequent calls) the GPU copy of the unit-diagonal
  lower-triangular factor taken from the host data of A. The strictly-lower entries are
  read row by row from a->i/a->j/a->a (presumably A holds an ILU/LU factorization here --
  this is only called from the factorization path) and a 1.0 diagonal entry is appended
  to each row, since cuSPARSE is told CUSPARSE_DIAG_TYPE_UNIT. On the first call the full
  CSR structure is uploaded and the csrsv analysis is performed; afterwards only the
  values array is re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i, *aj = a->j, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                           i, nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0); /* empty matrix: nothing to build */
  /* only rebuild when the host data is newer than (or the only copy of) the device data */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower = n + ai[n] - ai[1];
      if (!loTriFactor) {
        PetscScalar *AALo;

        /* pinned host buffers for fast transfer to the device */
        PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt)));

        /* Fill the lower triangular matrix: row 0 is just the unit diagonal entry */
        AiLo[0]   = (PetscInt)0;
        AiLo[n]   = nzLower;
        AjLo[0]   = (PetscInt)0;
        AALo[0]   = (MatScalar)1.0;
        v         = aa;
        vi        = aj;
        offset    = 1;
        rowOffset = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i] = rowOffset;
          rowOffset += nz + 1;

          /* copy the strictly-lower entries of row i, then append the unit diagonal */
          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          offset += nz;
          AjLo[offset] = (PetscInt)i;
          AALo[offset] = (MatScalar)1.0;
          offset += 1;

          v += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: assign() uploads the pinned host arrays to the device vectors */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = n;
        loTriFactor->csrMat->num_cols    = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo + nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        /* newer cuSPARSE csrsv2 API requires an explicit workspace buffer */
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer; AALo is kept as AA_h for the value-update path below */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;
        loTriFactor->AA_h                                          = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar)));
        /* Fill the lower triangular matrix: sparsity pattern is unchanged, so only values are staged */
        loTriFactor->AA_h[0] = 1.0;
        v                    = aa;
        vi                   = aj;
        offset               = 1;
        for (i = 1; i < n; i++) {
          nz = ai[i + 1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset += 1;
          v += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar)));
      }
    } catch (char *ex) {
      /* thrust/cusparse wrappers may throw; convert to a PETSc error */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
3999ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - build (or refresh) the device copy of the
  upper triangular ILU factor of A.

  On the first call (upTriFactor not yet created) the CSR structure is assembled on the
  host in pinned memory, uploaded to the GPU, and the cuSPARSE matrix descriptor plus
  triangular-solve analysis are created. On subsequent calls only the numerical values
  are recomputed and re-uploaded into the existing structure. Diagonal entries are
  stored inverted (1/diag) so the solve phase multiplies instead of divides.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  PetscInt                           n                  = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j, *adiag = a->diag, *vi;
  const MatScalar                   *aa = a->a, *v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                           i, nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0] - adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix, walking rows bottom-up since
           a->diag indexes rows from the end of the upper-triangular storage */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v  = aa + adiag[i + 1] + 1;
          vi = aj + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements (inverted, see note above) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1. / v[nz];
          AiUp[i]      = AiUp[i + 1] - (nz + 1);

          PetscCall(PetscArraycpy(&(AjUp[offset + 1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset + 1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = n;
        upTriFactor->csrMat->num_cols    = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h                                          = AAUp; /* keep the pinned values buffer for later value-only refreshes */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar)));
      } else {
        /* structure already on the GPU: rebuild only the values and re-upload */
        if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar)));
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i = n - 1; i >= 0; i--) {
          v = aa + adiag[i + 1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i + 1] - 1;

          /* decrement the offset */
          offset -= (nz + 1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1. / v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset + 1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar)));
      }
    } catch (std::exception &ex) { /* thrust/cusparse C++ wrappers throw std::exception subclasses; a char* handler never matches them */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex.what());
    }
  }
  PetscFunctionReturn(0);
}
5349ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - push both ILU triangular factors of A to
  the GPU and cache the row/column permutations there for use by the solve phase.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            rowis = aij->row, colis = aij->icol;
  PetscBool                     rid, cid;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  /* create or refresh the device copies of L and U */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
  fs->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the GPU, but only if it is non-trivial and not cached yet */
  PetscCall(ISIdentity(rowis, &rid));
  if (!rid && !fs->rpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(rowis, &idx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(rowis, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }

  /* likewise for the column permutation */
  PetscCall(ISIdentity(colis, &cid));
  if (!cid && !fs->cpermIndices) {
    const PetscInt *idx;

    PetscCall(ISGetIndices(colis, &idx));
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(idx, idx + m);
    PetscCall(ISRestoreIndices(colis, &idx));
    PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
5779ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - build (or refresh) the device copies of both
  triangular factors of an ICC factorization of A.

  NOTE: A->data is viewed through both Mat_SeqAIJ and Mat_SeqSBAIJ here; the factored
  ICC matrix stores only the upper triangle in SBAIJ-style i/j/a arrays. A single CSR
  structure (the upper triangle) is built on the host; the "lower" factor reuses the
  identical sparsity pattern and is applied with CUSPARSE_OPERATION_TRANSPOSE, which is
  why its fill mode below is also CUSPARSE_FILL_MODE_UPPER. Diagonal entries are stored
  inverted (1/diag). On the first call descriptors and solve analyses are created; on
  later calls only the values are re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                           nzUpper = a->nz, n = A->rmap->n, i, offset, nz, j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ *)A->data;
  const PetscInt                    *ai = b->i, *aj = b->j, *vj;
  const MatScalar                   *aa = b->a, *v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* pinned host staging buffers for the two value arrays */
      PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar)));
      if (!upTriFactor && !loTriFactor) {
        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt)));

        /* Fill the upper triangular matrix */
        AiUp[0] = (PetscInt)0;
        AiUp[n] = nzUpper;
        offset  = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          vj = aj + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements (inverted, see note above) */
          AjUp[offset] = (PetscInt)i;
          AAUp[offset] = (MatScalar)1.0 / v[nz];
          AiUp[i]      = offset;
          AALo[offset] = (MatScalar)1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the matrix */
        upTriFactor->csrMat              = new CsrMatrix;
        upTriFactor->csrMat->num_rows    = A->rmap->n;
        upTriFactor->csrMat->num_cols    = A->cmap->n;
        upTriFactor->csrMat->num_entries = a->nz;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                                  upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer));
#else
                                                  upTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor;

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
#else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
#endif
        /* fill mode is UPPER on purpose: the transpose operation below turns the stored
           upper triangle into the lower factor */
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat              = new CsrMatrix;
        loTriFactor->csrMat->num_rows    = A->rmap->n;
        loTriFactor->csrMat->num_cols    = A->cmap->n;
        loTriFactor->csrMat->num_entries = a->nz;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
        loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
        loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz);

        loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize));
#endif

        /* perform the solve analysis */
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                                  loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                                  loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer));
#else
                                                  loTriFactor->solveInfo));
#endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor;

        PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar))));
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
      } else {
        /* structure already on the GPU: rebuild only the values and re-upload */
        /* Fill the upper triangular matrix */
        offset = 0;
        for (i = 0; i < n; i++) {
          /* set the pointers */
          v  = aa + ai[i];
          nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */

          /* first, set the diagonal elements */
          AAUp[offset] = 1.0 / v[nz];
          AALo[offset] = 1.0 / v[nz];

          offset += 1;
          if (nz > 0) {
            PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
            for (j = offset; j < offset + nz; j++) {
              AAUp[j] = -AAUp[j];
              AALo[j] = AAUp[j] / v[nz];
            }
            offset += nz;
          }
        }
        PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
        upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz);
        loTriFactor->csrMat->values->assign(AALo, AALo + a->nz);
        PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar)));
      }
      PetscCallCUDA(cudaFreeHost(AAUp));
      PetscCallCUDA(cudaFreeHost(AALo));
    } catch (std::exception &ex) { /* thrust/cusparse C++ wrappers throw std::exception subclasses; a char* handler never matches them */
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex.what());
    }
  }
  PetscFunctionReturn(0);
}
782087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - push the ICC triangular factors of A to
  the GPU and cache the factorization ordering (and its inverse) there.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSETriFactors *fs   = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  IS                            perm = aij->row;
  PetscBool                     identity;
  PetscInt                      m = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(fs, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!fs->workVector) fs->workVector = new THRUSTARRAY(m);
  /* off-diagonal entries of the symmetric factor count twice, the diagonal once */
  fs->nnz = (aij->nz - m) * 2 + m;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* for a non-trivial ordering, stash the permutation and its inverse on the GPU */
  PetscCall(ISIdentity(perm, &identity));
  if (!identity) {
    IS              inv;
    const PetscInt *iidx, *idx;

    PetscCall(ISInvertPermutation(perm, PETSC_DECIDE, &inv));
    PetscCall(ISGetIndices(inv, &iidx));
    PetscCall(ISGetIndices(perm, &idx));
    fs->rpermIndices = new THRUSTINTARRAY(m);
    fs->rpermIndices->assign(idx, idx + m);
    fs->cpermIndices = new THRUSTINTARRAY(m);
    fs->cpermIndices->assign(iidx, iidx + m);
    PetscCall(ISRestoreIndices(inv, &iidx));
    PetscCall(ISDestroy(&inv));
    PetscCall(ISRestoreIndices(perm, &idx));
    PetscCall(PetscLogCpuToGpu(2. * m * sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
819087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky/ICC factorization.

  The factorization itself runs on the host (via the SeqAIJ implementation); the
  resulting triangular factors are then analyzed and copied to the GPU, and the solve
  callbacks are selected based on whether the ordering is the natural one.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJ *bseq    = (Mat_SeqAIJ *)B->data;
  IS          rowperm = bseq->row;
  PetscBool   identity;

  PetscFunctionBegin;
  /* factor on the CPU, so make sure the host copy of A is current */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(rowperm, &identity));
  if (identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* both orderings disable MatMatSolve */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
8489ae82921SPaul Mullowney 
/* Build the explicit transpose (CSC storage) of one triangular factor and run the
   cusparse triangular-solve analysis on it.

   A                  - the factored matrix (used only for event logging)
   cusparseTriFactors - the factor container (supplies the cusparse handle)
   triFactor          - the factor to transpose; its csr2cscBuffer(Size) fields are
                        (re)used as scratch for the conversion
   triFactorT_p       - on success, the newly allocated, fully analyzed transpose

   The transpose is stored explicitly so that MatSolveTranspose can use a
   non-transposed cusparse solve, which is much faster than solving with
   CUSPARSE_OPERATION_TRANSPOSE. */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_Private(Mat A, Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors, Mat_SeqAIJCUSPARSETriFactorStruct *triFactor, Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT_p)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseIndexBase_t                indexBase;
  cusparseMatrixType_t               matrixType;
  cusparseFillMode_t                 fillMode;
  cusparseDiagType_t                 diagType;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  PetscCall(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* set the matrix descriptors: same type/base/diagonal as the factor, but the
     fill mode flips since the transpose of a lower factor is upper (and vice versa) */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* the transpose is stored explicitly, so the solve uses the non-transpose operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat                 = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows + 1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows, triFactor->csrMat->num_cols, triFactor->csrMat->num_entries, triFactor->csrMat->values->data().get(),
                                                  triFactor->csrMat->row_offsets->data().get(), triFactor->csrMat->column_indices->data().get(), triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                                  triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactor->csr2cscBuffer, triFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows, triFactor->csrMat->num_cols, triFactor->csrMat->num_entries, triFactor->csrMat->values->data().get(), triFactor->csrMat->row_offsets->data().get(),
                                     triFactor->csrMat->column_indices->data().get(), triFactorT->csrMat->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                     triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer));
#else
                                     triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* bug fix: this was PetscLogEventBegin, leaving MAT_CUSPARSEGenerateTranspose begun twice and never ended */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, triFactorT->solveOp, triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr, triFactorT->csrMat->values->data().get(),
                                            triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo, &triFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactorT->solveBuffer, triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, triFactorT->solveOp, triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr, triFactorT->csrMat->values->data().get(),
                                            triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                                            triFactorT->solveInfo, triFactorT->solvePolicy, triFactorT->solveBuffer));
#else
                                            triFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  *triFactorT_p = triFactorT;
  PetscFunctionReturn(0);
}

/* Form and analyze the explicit transposes of both triangular factors of A so
   MatSolveTranspose_SeqAIJCUSPARSE can use fast non-transposed solves. The
   results are cached in the Mat_SeqAIJCUSPARSETriFactors hung off A->spptr.
   Previously the lower- and upper-factor code was duplicated inline; it now
   lives in MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_Private(). */
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT, *upTriFactorT;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_Private(A, cusparseTriFactors, (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr, &loTriFactorT));
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;
  /* transpose of the upper triangular factor */
  PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_Private(A, cusparseTriFactors, (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr, &upTriFactorT));
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1012bda325fcSPaul Mullowney 
/* Unary functor converting a PetscScalar to a PetscInt by casting its real part;
   usable on both host and device (e.g. with thrust::transform). */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar x) { return (PetscInt)PetscRealPart(x); }
};
1016a49f1ed0SStefano Zampini 
1017*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1018*d71ae5a4SJacob Faibussowitsch {
1019aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
1020a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1021bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
1022bda325fcSPaul Mullowney   cusparseStatus_t              stat;
1023aa372e3fSPaul Mullowney   cusparseIndexBase_t           indexBase;
1024b175d8bbSPaul Mullowney 
1025bda325fcSPaul Mullowney   PetscFunctionBegin;
10269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1027a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
102828b400f6SJacob Faibussowitsch   PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
1029a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
103008401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
10311a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
10329566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
10339566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
103448a46eb9SPierre Jolivet   if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
1035a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1036aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
10379566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1038aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
10399566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
10409566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1041aa372e3fSPaul Mullowney 
1042b06137fdSPaul Mullowney     /* set alpha and beta */
10439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar)));
10449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero), sizeof(PetscScalar)));
10459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
10469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10479566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
10489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
1049b06137fdSPaul Mullowney 
1050aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1051aa372e3fSPaul Mullowney       CsrMatrix *matrixT      = new CsrMatrix;
1052a49f1ed0SStefano Zampini       matstructT->mat         = matrixT;
1053554b8892SKarl Rupp       matrixT->num_rows       = A->cmap->n;
1054554b8892SKarl Rupp       matrixT->num_cols       = A->rmap->n;
1055aa372e3fSPaul Mullowney       matrixT->num_entries    = a->nz;
1056a8bd5306SMark Adams       matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
1057aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1058aa372e3fSPaul Mullowney       matrixT->values         = new THRUSTARRAY(a->nz);
1059a3fdcf43SKarl Rupp 
1060ad540459SPierre Jolivet       if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
106181902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
1062afb2bd1cSJunchao Zhang 
1063afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
10643606e59fSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
10659371c9d4SSatish Balay       stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
10669371c9d4SSatish Balay                                indexBase, cusparse_scalartype);
10679371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
10683606e59fSJunchao Zhang   #else
10693606e59fSJunchao Zhang       /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
10703606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
10713606e59fSJunchao Zhang 
10723606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
10733606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
10743606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
10753606e59fSJunchao Zhang         */
10763606e59fSJunchao Zhang       if (matrixT->num_entries) {
10779371c9d4SSatish Balay         stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
10789371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
10793606e59fSJunchao Zhang 
10803606e59fSJunchao Zhang       } else {
10813606e59fSJunchao Zhang         matstructT->matDescr = NULL;
10823606e59fSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
10833606e59fSJunchao Zhang       }
10843606e59fSJunchao Zhang   #endif
1085afb2bd1cSJunchao Zhang #endif
1086aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1087afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1088afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1089afb2bd1cSJunchao Zhang #else
1090aa372e3fSPaul Mullowney       CsrMatrix *temp = new CsrMatrix;
109151c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
109251c6d536SStefano Zampini       /* First convert HYB to CSR */
1093aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1094aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1095aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1096aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1097aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1098aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1099aa372e3fSPaul Mullowney 
11009371c9d4SSatish Balay       stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
11019371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1102aa372e3fSPaul Mullowney 
1103aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1104aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1105aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1106aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1107aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1);
1108aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1109aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1110aa372e3fSPaul Mullowney 
11119371c9d4SSatish Balay       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
11129371c9d4SSatish Balay                               tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
11139371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1114aa372e3fSPaul Mullowney 
1115aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1116aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
11179566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
11189371c9d4SSatish Balay       cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
11199371c9d4SSatish Balay       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
11209371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
1121aa372e3fSPaul Mullowney 
1122aa372e3fSPaul Mullowney       /* assign the pointer */
1123aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
11241a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1125aa372e3fSPaul Mullowney       /* delete temporaries */
1126aa372e3fSPaul Mullowney       if (tempT) {
1127aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY *)tempT->values;
1128aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
1129aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
1130aa372e3fSPaul Mullowney         delete (CsrMatrix *)tempT;
1131087f3262SPaul Mullowney       }
1132aa372e3fSPaul Mullowney       if (temp) {
1133aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY *)temp->values;
1134aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
1135aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
1136aa372e3fSPaul Mullowney         delete (CsrMatrix *)temp;
1137aa372e3fSPaul Mullowney       }
1138afb2bd1cSJunchao Zhang #endif
1139aa372e3fSPaul Mullowney     }
1140a49f1ed0SStefano Zampini   }
1141a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1142a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
1143a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
114428b400f6SJacob Faibussowitsch     PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
114528b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
114628b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
114728b400f6SJacob Faibussowitsch     PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
114828b400f6SJacob Faibussowitsch     PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
114928b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
115028b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
115128b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
1152a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1153a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
1154a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
11559566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
1156a49f1ed0SStefano Zampini     }
1157a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1158a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1159792fecdfSBarry Smith       PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1160a49f1ed0SStefano Zampini 
1161a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1162a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
1163a49f1ed0SStefano Zampini       void  *csr2cscBuffer;
1164a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
11659371c9d4SSatish Balay       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
11669371c9d4SSatish Balay                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
11679371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
11689566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
1169a49f1ed0SStefano Zampini #endif
1170a49f1ed0SStefano Zampini 
11711a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
11721a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
11731a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
11741a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
11751a2c6b5cSJunchao Zhang 
11761a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
11771a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
11781a2c6b5cSJunchao Zhang         */
11799371c9d4SSatish Balay         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
1180a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11819371c9d4SSatish Balay                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
11829371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1183a49f1ed0SStefano Zampini #else
11849371c9d4SSatish Balay                                 matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
11859371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
1186a49f1ed0SStefano Zampini #endif
11871a2c6b5cSJunchao Zhang       } else {
11881a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
11891a2c6b5cSJunchao Zhang       }
11901a2c6b5cSJunchao Zhang 
1191a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1192792fecdfSBarry Smith       PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
1193a49f1ed0SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
11949566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1195a49f1ed0SStefano Zampini #endif
1196a49f1ed0SStefano Zampini     }
11979371c9d4SSatish Balay     PetscCallThrust(
11989371c9d4SSatish Balay       thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
1199a49f1ed0SStefano Zampini   }
12009566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
12019566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
1202213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1203213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1204aa372e3fSPaul Mullowney   /* assign the pointer */
1205aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
12061a2c6b5cSJunchao Zhang   A->transupdated                                = PETSC_TRUE;
1207bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1208bda325fcSPaul Mullowney }
1209bda325fcSPaul Mullowney 
1210a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solves A^T x = b on the GPU for a factorization
  computed with a (non-natural) ordering, so the stored row/column permutations must
  be applied around the two triangular sweeps.

  Data flow (all on the device, note the sweep order is upper first, then lower --
  the reverse of the forward solve in MatSolve_SeqAIJCUSPARSE):
    x    <- b permuted by rpermIndices
    temp <- transposed-upper-factor solve applied to x
    x    <- transposed-lower-factor solve applied to temp
    temp <- x permuted by cpermIndices   (cannot be done in place)
    x    <- temp
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; only done once, on the first transpose solve */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, writing the result into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU);

  /* Next, solve with the transposed upper factor: rhs is xarray, result goes into tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor: rhs is tempGPU, result goes into xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore the vector arrays before logging so timing brackets only the GPU work above */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1284bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solves A^T x = b on the GPU for a
  factorization done in natural ordering, so no row/column permutations are needed.

  Data flow (all on the device):
    temp <- transposed-upper-factor solve applied to b
    x    <- transposed-lower-factor solve applied to temp
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly; only done once, on the first transpose solve */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve with the transposed upper factor: rhs is barray, result goes into tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve with the transposed lower factor: rhs is tempGPU, result goes into xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactorT->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore the vector arrays before logging so timing brackets only the GPU work above */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1344bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - forward/backward triangular solve of A x = b on the GPU
  for a factorization computed with a (non-natural) ordering, so the stored
  row/column permutations are applied around the two sweeps.

  Data flow (all on the device):
    temp <- b permuted by rpermIndices
    x    <- lower-factor solve applied to temp
    temp <- upper-factor solve applied to x
    x    <- temp permuted by cpermIndices
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder b with the row permutation, writing the result into the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin());

  /* Next, solve L: rhs is tempGPU, result goes into xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* Then, solve U: rhs is xarray, result goes into tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Last, reorder with the column permutation, scattering the work vector into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU);

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
14079ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - forward/backward triangular solve of
  A x = b on the GPU for a factorization done in natural ordering; no permutations
  are needed, so b feeds the lower sweep directly and x receives the upper sweep.

  Data flow (all on the device):
    temp <- lower-factor solve applied to b
    x    <- upper-factor solve applied to temp
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                   stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L: rhs is barray, result goes into tempGPU */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              loTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              tempGPU->data().get());
  PetscCallCUSPARSE(stat);
#endif

  /* Next, solve U: rhs is tempGPU, result goes into xarray */
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              upTriFactor->csrMat->num_entries,
#endif
                              &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                              tempGPU->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
                              xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer);
  PetscCallCUSPARSE(stat);
#else
                              xarray);
  PetscCallCUSPARSE(stat);
#endif

  /* restore the vector arrays before logging so timing brackets only the GPU work above */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
14599ae82921SPaul Mullowney 
1460da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1461da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/*
  MatSolve_SeqAIJCUSPARSE_ILU0 - solves A x = b with the ILU(0) factors using the
  generic cusparseSpSV API (cuSPARSE >= 11.3).

  The two sweeps share the dense-vector descriptors dnVecDescr_X/dnVecDescr_Y; the
  intermediate y lives in the device work array fs->Y:
    L y = b   (X wraps b, Y wraps fs->Y)
    U x = y   (X re-pointed at x)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xdata;
  const PetscScalar            *bdata;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* Lower sweep: L y = b */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  /* NOTE: cusparseSpSV_solve() secretly reuses the external buffer handed to cusparseSpSV_analysis() */
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_L));

  /* Upper sweep: U x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));

  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular sweeps over the aij->nz stored nonzeros */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1493da112707SJunchao Zhang 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_ILU0 - solves A^T x = b with the ILU(0) factors.

  The transpose solve reuses the L and U matrix descriptors and simply passes
  CUSPARSE_OPERATION_TRANSPOSE. The SpSV descriptors/buffers for the transposed
  sweeps are created lazily on the first call, and the (numeric) SpSV analysis is
  redone whenever the factor values changed since the last transpose solve.

  Sweep order is the reverse of the forward solve:
    U^T y = b   (y in the device work array fs->Y)
    L^T x = y
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ *)fact->data;
  PetscScalar                  *xdata;
  const PetscScalar            *bdata;

  PetscFunctionBegin;
  /* First transpose solve ever: create the Lt/Ut descriptors and size their work buffers */
  if (!factors->createdTransposeSpSVDescr) {
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Lt));
    /* the matrix descriptor is still L; only the operation is transposed */
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, &factors->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&factors->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, &factors->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Lt, factors->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&factors->spsvBuffer_Ut, factors->spsvBufferSize_Ut));
    factors->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* Redo the (numeric) analysis if the factor values changed since the last transpose solve */
  if (!factors->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt, factors->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut, factors->spsvBuffer_Ut));
    factors->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xdata));
  PetscCall(VecCUDAGetArrayRead(b, &bdata));
  PetscCall(PetscLogGpuTimeBegin());

  /* First sweep: U^T y = b, with y kept in factors->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, (void *)bdata));
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_Y, factors->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_U, factors->dnVecDescr_X, factors->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Ut));

  /* Second sweep: L^T x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(factors->dnVecDescr_X, xdata));
  PetscCallCUSPARSE(cusparseSpSV_solve(factors->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, factors->spMatDescr_L, factors->dnVecDescr_Y, factors->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, factors->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &bdata));
  PetscCall(VecCUDARestoreArrayWrite(x, &xdata));
  PetscCall(PetscLogGpuTimeEnd());
  /* flop count for the two triangular sweeps over the aij->nz stored nonzeros */
  PetscCall(PetscLogGpuFlops(2.0 * a->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1542da112707SJunchao Zhang 
/* MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - numeric phase of the cuSPARSE ILU(0) factorization

   Copies A's values into the factor matrix and factorizes them in place with cusparseXcsrilu02(),
   then (re)runs the numeric SpSV analyses needed by the L and U triangular solves.
   All descriptors, device buffers and the csrilu02 analysis were prepared by
   MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(), which installed this routine in fact->ops->lufactornumeric.

   Input Parameters:
+  fact - the factor matrix; its spptr holds the Mat_SeqAIJCUSPARSETriFactors set up at symbolic time
.  A    - the matrix to factor; must be MATSEQAIJCUSPARSE with the same nonzero pattern as at symbolic time
-  info - factorization options (not referenced here; the ILU(0) pattern is fixed)
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's current values are on the device before we read them */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot is a blocking query; numerical_zero is only meaningful when status is CUSPARSE_STATUS_ZERO_PIVOT */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated once, at symbolic time */
  PetscFunctionReturn(0);
}
1594da112707SJunchao Zhang 
/* MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - symbolic phase of the cuSPARSE ILU(0) factorization

   ILU(0) keeps A's sparsity pattern, so this routine copies A's (uncompressed) row pointers and
   column indices to the factor, creates the cuSPARSE descriptors for M (the in-place L+U storage),
   L (unit lower) and U (non-unit upper), allocates/aliases the work buffers, runs the csrilu02
   structural analysis, and estimates the FLOPs the numeric phase will log.

   Input Parameters:
+  fact  - the factor matrix to set up
.  A     - the matrix to factor; must be square MATSEQAIJCUSPARSE with a full diagonal
.  isrow - row permutation (not used; ILU(0) factors in the natural ordering)
.  iscol - column permutation (not used)
-  info  - factorization options; only info->fill is recorded (ILU(0) has fill ratio 1)
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0; /* ILU(0) introduces no fill */

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT; /* L of an ILU factorization has an implicit unit diagonal */
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                   fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                                 fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, *Adiag, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A)); /* fill Aseq->diag so we can count nonzeros left of each diagonal */
    Ai    = Aseq->i;
    Adiag = Aseq->diag;
    for (PetscInt i = 0; i < m; i++) {
      if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = Ai[i + 1] - Ai[i];
        nzLeft = Adiag[i] - Ai[i];
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Fix: an earlier version overwrote the exact nzLeft computed above with the symmetric-pattern
          guess (nzRow - 1) / 2 (used by the ICC0 routine, which has no Adiag); since we marked the
          diagonal precisely for this purpose, use the exact count.
        */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}
1751da112707SJunchao Zhang 
/* MatSolve_SeqAIJCUSPARSE_ICC0 - triangular solve with an IC(0) factor: A = L*L^T, so solve
   L y = b (forward) followed by L^T x = y (backward). The intermediate y lives in the work
   array fs->Y allocated at symbolic time; only fs->spMatDescr_L is needed since the same L
   is used transposed for the second solve (via fs->spsvDescr_Lt). */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  const PetscScalar            *b_d;
  PetscScalar                  *x_d;

  PetscFunctionBegin;
  /* b and x are distinct vectors for MatSolve, so the acquire order is immaterial */
  PetscCall(VecCUDAGetArrayRead(b, &b_d));
  PetscCall(VecCUDAGetArrayWrite(x, &x_d));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L y = b: point descriptor X at b's device array, Y at the work array */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)b_d));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Backward solve L^T x = y: retarget descriptor X at x's device array and solve transposed */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, x_d));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &b_d));
  PetscCall(VecCUDARestoreArrayWrite(x, &x_d));

  PetscCall(PetscLogGpuTimeEnd());
  /* two triangular solves over nz nonzeros each, minus the n divisions counted once */
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1782da112707SJunchao Zhang 
/* MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - numeric phase of the cuSPARSE IC(0) (incomplete Cholesky) factorization

   Copies A's values into the factor and factorizes them in place with cusparseXcsric02(), then runs the
   numeric SpSV analyses for the L and L^T solves. Descriptors, buffers and the csric02 structural analysis
   were prepared by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0().

   Input Parameters:
+  fact - the factor matrix; its spptr holds the Mat_SeqAIJCUSPARSETriFactors set up at symbolic time
.  A    - the matrix to factor; must be MATSEQAIJCUSPARSE with the same nonzero pattern as at symbolic time
-  info - factorization options (not referenced here; the IC(0) pattern is fixed)
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure A's current values are on the device before we read them */
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M)); /* csric02 errors out on empty matrices (m=0) */
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    /* zeroPivot is a blocking query; numerical_zero is only meaningful when status is CUSPARSE_STATUS_ZERO_PIVOT */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  /* SpSV analysis is numeric (needs valid values), so it must come after cusparseXcsric02() */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

  /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
    ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0; /* A = L L^T is symmetric, so the transpose solve is the same routine */
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops)); /* flop count was estimated once, at symbolic time */
  PetscFunctionReturn(0);
}
1835da112707SJunchao Zhang 
1836*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1837*d71ae5a4SJacob Faibussowitsch {
1838da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
1839da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
1840da112707SJunchao Zhang   PetscInt                      m, nz;
1841da112707SJunchao Zhang 
1842da112707SJunchao Zhang   PetscFunctionBegin;
1843da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1844da112707SJunchao Zhang     PetscInt  i;
1845da112707SJunchao Zhang     PetscBool flg, missing;
1846da112707SJunchao Zhang 
1847da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
1848da112707SJunchao Zhang     PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
1849da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
1850da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A, &missing, &i));
1851da112707SJunchao Zhang     PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1852da112707SJunchao Zhang   }
1853da112707SJunchao Zhang 
1854da112707SJunchao Zhang   /* Free the old stale stuff */
1855da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
1856da112707SJunchao Zhang 
1857da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
1858da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
1859da112707SJunchao Zhang    */
1860da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));
1861da112707SJunchao Zhang 
1862da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
1863da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
1864da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
1865da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
1866da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
1867da112707SJunchao Zhang 
1868da112707SJunchao Zhang   aij->row = NULL;
1869da112707SJunchao Zhang   aij->col = NULL;
1870da112707SJunchao Zhang 
1871da112707SJunchao Zhang   /* ====================================================================== */
1872da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
1873da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
1874da112707SJunchao Zhang   /* ====================================================================== */
1875da112707SJunchao Zhang   const int *Ai, *Aj;
1876da112707SJunchao Zhang 
1877da112707SJunchao Zhang   m  = fact->rmap->n;
1878da112707SJunchao Zhang   nz = aij->nz;
1879da112707SJunchao Zhang 
1880da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr, sizeof(int) * (m + 1)));
1881da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx, sizeof(int) * nz));
1882da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
1883da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
1884da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr, Ai, sizeof(int) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1885da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx, Aj, sizeof(int) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
1886da112707SJunchao Zhang 
1887da112707SJunchao Zhang   /* ====================================================================== */
1888da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
1889da112707SJunchao Zhang   /* ====================================================================== */
1890da112707SJunchao Zhang   cusparseFillMode_t fillMode;
1891da112707SJunchao Zhang   cusparseDiagType_t diagType;
1892da112707SJunchao Zhang 
1893da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
1894da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
1895da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
1896da112707SJunchao Zhang 
1897da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
1898da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
1899da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
1900da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
1901da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
1902da112707SJunchao Zhang   */
1903da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
1904da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
19059371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
19069371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
19079371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));
1908da112707SJunchao Zhang 
1909da112707SJunchao Zhang   /* ========================================================================= */
1910da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
1911da112707SJunchao Zhang   /* ========================================================================= */
1912da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
19139371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, &fs->factBufferSize_M));
1914da112707SJunchao Zhang 
1915da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
1916da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));
1917da112707SJunchao Zhang 
1918da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
1919da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));
1920da112707SJunchao Zhang 
1921da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
19229371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));
1923da112707SJunchao Zhang 
1924da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
19259371c9d4SSatish Balay   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));
1926da112707SJunchao Zhang 
192712ba2bc6SJunchao Zhang   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
192812ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
192912ba2bc6SJunchao Zhang    */
193012ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
193112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
193212ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
1933da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
193412ba2bc6SJunchao Zhang   } else {
193512ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
193612ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
193712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
193812ba2bc6SJunchao Zhang   }
1939da112707SJunchao Zhang 
1940da112707SJunchao Zhang   /* ========================================================================== */
1941da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
1942da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
1943da112707SJunchao Zhang   /* ========================================================================== */
1944da112707SJunchao Zhang   int              structural_zero;
1945da112707SJunchao Zhang   cusparseStatus_t status;
1946da112707SJunchao Zhang 
1947da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
19489371c9d4SSatish Balay   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr, fs->csrColIdx, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
1949da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
1950da112707SJunchao Zhang     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
1951da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
1952da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
1953da112707SJunchao Zhang   }
1954da112707SJunchao Zhang 
1955da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
19560dd8c0acSJunchao Zhang   {
1957da112707SJunchao Zhang     Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
19580dd8c0acSJunchao Zhang     PetscInt      *Ai, nzRow, nzLeft;
1959da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
1960da112707SJunchao Zhang 
1961da112707SJunchao Zhang     Ai = Aseq->i;
1962da112707SJunchao Zhang     for (PetscInt i = 0; i < m; i++) {
1963da112707SJunchao Zhang       nzRow = Ai[i + 1] - Ai[i];
1964da112707SJunchao Zhang       if (nzRow > 1) {
1965da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
1966da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
1967da112707SJunchao Zhang         */
1968da112707SJunchao Zhang         nzLeft = (nzRow - 1) / 2;
1969da112707SJunchao Zhang         flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
1970da112707SJunchao Zhang       }
1971da112707SJunchao Zhang     }
1972da112707SJunchao Zhang     fs->numericFactFlops = flops;
19730dd8c0acSJunchao Zhang   }
1974da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
1975da112707SJunchao Zhang   PetscFunctionReturn(0);
1976da112707SJunchao Zhang }
1977da112707SJunchao Zhang #endif
1978da112707SJunchao Zhang 
/*
  Symbolic ILU factorization for MATSEQAIJCUSPARSE factor matrices.

  Input Parameters:
+ B      - the factor matrix (holds the Mat_SeqAIJCUSPARSETriFactors in B->spptr)
. A      - the matrix to factor
. isrow  - row permutation
. iscol  - column permutation
- info   - factorization options (info->levels = fill level k of ILU(k))

  When cuSPARSE is new enough, ILU(0) with identity row/column permutations is
  dispatched to a device-side fast path; everything else falls back to the host
  symbolic phase, with only the numeric phase later done via CUSPARSE.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
  /* Only test the permutations when device factorization was requested; otherwise
     the flags stay FALSE and we take the host fallback below */
  if (cusparseTriFactors->factorizeOnDevice) {
    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
  }
  if (!info->levels && row_identity && col_identity) {
    /* ILU(0) with natural ordering: device fast path */
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    /* Host path: drop any stale device factor data, reuse the SeqAIJ symbolic
       factorization, and install the CUSPARSE numeric routine */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2001da112707SJunchao Zhang 
/*
  Symbolic LU factorization for MATSEQAIJCUSPARSE: the symbolic phase is always done
  on the host via the SeqAIJ implementation; only the numeric phase is redirected to
  the CUSPARSE routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Discard any device-side factor data left over from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  /* Host symbolic phase, then hook in the GPU numeric phase */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2012da112707SJunchao Zhang 
/*
  Symbolic ICC factorization for MATSEQAIJCUSPARSE factor matrices.

  Mirrors MatILUFactorSymbolic_SeqAIJCUSPARSE: when cuSPARSE is new enough and the
  permutation is the identity, ICC(0) is dispatched to the device-side fast path;
  otherwise the host SeqAIJ symbolic phase is used and only the numeric phase runs
  through CUSPARSE.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if CUSPARSE_VERSION >= 11500
  PetscBool perm_identity = PETSC_FALSE;
  /* Only test the permutation when device factorization was requested */
  if (cusparseTriFactors->factorizeOnDevice) PetscCall(ISIdentity(perm, &perm_identity));
  if (!info->levels && perm_identity) {
    /* ICC(0) with natural ordering: device fast path */
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    /* Host path: reset device factor data, host symbolic phase, GPU numeric phase */
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(0);
}
2032da112707SJunchao Zhang 
/*
  Symbolic Cholesky factorization for MATSEQAIJCUSPARSE: the symbolic phase is always
  done on the host via the SeqAIJ implementation; the numeric phase is redirected to
  the CUSPARSE routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  /* Discard any device-side factor data left over from a previous factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&trifactors));
  /* Host symbolic phase, then hook in the GPU numeric phase */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2043da112707SJunchao Zhang 
/*
  Report the solver package ("cusparse") that produced this factor matrix.
  The matrix argument is unused; the answer is a compile-time constant.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2050841d4cb1SJunchao Zhang 
2051841d4cb1SJunchao Zhang /*MC
2052841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
2058841d4cb1SJunchao Zhang 
2059841d4cb1SJunchao Zhang   Level: beginner
2060841d4cb1SJunchao Zhang 
206111a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2062841d4cb1SJunchao Zhang M*/
2063841d4cb1SJunchao Zhang 
/*
  Create a MATSEQAIJCUSPARSE factor matrix B of the requested factor type for A.

  Supported factor types: LU/ILU/ILUDT and Cholesky/ICC. The option
  -mat_factor_bind_factorization {host|device} selects where the factorization
  itself runs (default "device"); the choice is recorded in the triangular-factors
  struct attached to B. If A is bound to the CPU, the plain SeqAIJ symbolic
  routines are installed instead of the CUSPARSE ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt  n = A->rmap->n;
  PetscBool factOnDevice, factOnHost;
  char     *prefix;
  char      factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  /* Create the (square) factor matrix with the same row size as A */
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  /* Query where the user wants the factorization performed (host or device) */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)), prefix, "MatGetFactor", "Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization", "Do matrix factorization on host or device when possible", "MatGetFactor", NULL, factPlace, sizeof(factPlace), NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device", factPlace, &factOnDevice));
  PetscCall(PetscStrcasecmp("host", factPlace, &factOnHost));
  PetscCheck(factOnDevice || factOnHost, PetscObjectComm((PetscObject)(*B)), PETSC_ERR_ARG_OUTOFRANGE, "Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed", factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors *)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    /* Install CUSPARSE symbolic routines unless A is pinned to the CPU */
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    /* Preferred orderings: nested dissection for full LU, natural for incomplete factors */
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  /* Factors allocate their own storage later; skip preallocation now */
  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2116841d4cb1SJunchao Zhang 
/*
  Copy the matrix values from the GPU back into the host CSR arrays (a->a).

  Only runs when the up-to-date copy lives exclusively on the GPU
  (offloadmask == PETSC_OFFLOAD_GPU); only values are transferred — the host
  sparsity pattern (a->i, a->j) is reused as-is. Unfactored matrices copy from
  the CsrMatrix inside the mult struct; factored matrices (newer cuSPARSE only)
  copy from the triangular-factors csrVal. Afterwards host and device agree,
  so the offload mask becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if CUSPARSE_VERSION >= 13500
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      /* Unfactored matrix: values live in the device CsrMatrix */
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if CUSPARSE_VERSION >= 13500
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
21467e8381f9SStefano Zampini 
/* Hand out read-write access to the host values array, syncing device->host first */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Make sure the host copy is current before exposing it */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
215467a45760SJunchao Zhang 
/* End read-write access: the host values may have changed, so mark the device copy stale */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host now holds the authoritative values */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
216267a45760SJunchao Zhang 
/* Hand out read-only access to the host values array, syncing device->host first */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  /* Refresh the host copy; read-only access does not invalidate the device copy */
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ *)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
217067a45760SJunchao Zhang 
/* End read-only access; the offload mask is left untouched since nothing was modified */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL; /* invalidate the caller's pointer */
  PetscFunctionReturn(0);
}
217767a45760SJunchao Zhang 
/* Hand out write-only access to the host values array; no device->host copy is performed */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
218467a45760SJunchao Zhang 
/* End write-only access: host values were (re)written, so mark the device copy stale */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  /* Host now holds the authoritative values */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
21927e8381f9SStefano Zampini 
/*
  Return device (CUDA) pointers to the CSR arrays of the matrix, plus the memory type.

  Any of i/j/a/mtype may be NULL when the caller does not need that output. The matrix
  is synced to the GPU first; factored matrices are rejected. The device index arrays
  are 32-bit (THRUSTINTARRAY32), hence the unconditional error when PETSc was built
  with 64-bit indices.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  PetscCheck(cusp != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    /* row_offsets is a 32-bit device array; PetscInt must also be 32-bit for this cast to be safe */
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
22237ee59b9bSJunchao Zhang 
/*
  Copy/build the device (cuSPARSE) representation of a SeqAIJ matrix.

  Two paths:
  - If the nonzero pattern is unchanged (same nonzerostate) and the format is CSR,
    only the values array is re-uploaded into the existing device CsrMatrix.
  - Otherwise the whole device structure is rebuilt: the old mult struct and work
    vectors are destroyed, the CSR (or, pre CUDA-11, ELL/HYB) matrix is recreated
    from the host arrays, and compressed-row index data is uploaded when in use.

  If the host value array a->a is absent, only the pattern is uploaded and the
  offload mask is NOT set to PETSC_OFFLOAD_BOTH (tracked via `both`).
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Fast path: same sparsity pattern, just upload the new values */
      matrix->values->assign(a->a, a->a + a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz) * sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Transpose values (if cached) are now stale; pattern is still valid */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
    } else {
      PetscInt nnz;
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      /* Rebuild path: tear down the old device structures entirely */
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Pick row data from the compressed-row structure when in use */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data");
        /* No host values: upload pattern only and remember host/device do not fully agree */
        if (!a->a) {
          nnz  = ii[m];
          both = PETSC_FALSE;
        } else nnz = a->nz;
        PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct             = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants (1, 0, 1) used with device pointer mode */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat   = new CsrMatrix;
          mat->num_rows    = m;
          mat->num_cols    = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
            PetscCallCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
          SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* Pre CUDA-11 only: build a temporary CSR, convert to HYB/ELL, then free the CSR */
          CsrMatrix *mat = new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m + 1);
          mat->row_offsets->assign(ii, ii + m + 1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j + nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a + nnz);

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition);
          PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY *)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets;
            delete (CsrMatrix *)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx, ridx + m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp                        = 0;
        }
        PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch (char *ex) {
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
23739ae82921SPaul Mullowney 
/* Thrust functor for zipped (x, y) tuples: accumulate element 0 into element 1 (y += x) */
struct VecCUDAPlusEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t) + thrust::get<1>(t);
  }
};
2381aa372e3fSPaul Mullowney 
23829371c9d4SSatish Balay struct VecCUDAEquals {
23837e8381f9SStefano Zampini   template <typename Tuple>
2384*d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2385*d71ae5a4SJacob Faibussowitsch   {
23867e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
23877e8381f9SStefano Zampini   }
23887e8381f9SStefano Zampini };
23897e8381f9SStefano Zampini 
23909371c9d4SSatish Balay struct VecCUDAEqualsReverse {
2391e6e9a74fSStefano Zampini   template <typename Tuple>
2392*d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
2393*d71ae5a4SJacob Faibussowitsch   {
2394e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
2395e6e9a74fSStefano Zampini   }
2396e6e9a74fSStefano Zampini };
2397e6e9a74fSStefano Zampini 
2398afb2bd1cSJunchao Zhang struct MatMatCusparse {
2399ccdfe979SStefano Zampini   PetscBool      cisdense;
2400ccdfe979SStefano Zampini   PetscScalar   *Bt;
2401ccdfe979SStefano Zampini   Mat            X;
2402fcdce8c4SStefano Zampini   PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2403fcdce8c4SStefano Zampini   PetscLogDouble flops;
2404fcdce8c4SStefano Zampini   CsrMatrix     *Bcsr;
2405b4285af6SJunchao Zhang 
2406afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2407fcdce8c4SStefano Zampini   cusparseSpMatDescr_t matSpBDescr;
2408afb2bd1cSJunchao Zhang   PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
2409afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
2410afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
2411afb2bd1cSJunchao Zhang   PetscInt             Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/
2412b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2413b4285af6SJunchao Zhang   void *dBuffer4;
2414b4285af6SJunchao Zhang   void *dBuffer5;
2415b4285af6SJunchao Zhang   #endif
2416fcdce8c4SStefano Zampini   size_t                mmBufferSize;
2417fcdce8c4SStefano Zampini   void                 *mmBuffer;
2418fcdce8c4SStefano Zampini   void                 *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2419fcdce8c4SStefano Zampini   cusparseSpGEMMDescr_t spgemmDesc;
2420afb2bd1cSJunchao Zhang #endif
2421afb2bd1cSJunchao Zhang };
2422ccdfe979SStefano Zampini 
2423*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2424*d71ae5a4SJacob Faibussowitsch {
2425ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
2426ccdfe979SStefano Zampini 
2427ccdfe979SStefano Zampini   PetscFunctionBegin;
24289566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(mmdata->Bt));
2429fcdce8c4SStefano Zampini   delete mmdata->Bcsr;
2430afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
24319566063dSJacob Faibussowitsch   if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr));
24329566063dSJacob Faibussowitsch   if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
24339566063dSJacob Faibussowitsch   if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
24349566063dSJacob Faibussowitsch   if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc));
2435b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
24369566063dSJacob Faibussowitsch   if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4));
24379566063dSJacob Faibussowitsch   if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5));
2438b4285af6SJunchao Zhang   #endif
24399566063dSJacob Faibussowitsch   if (mmdata->mmBuffer) PetscCallCUDA(cudaFree(mmdata->mmBuffer));
24409566063dSJacob Faibussowitsch   if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2));
2441afb2bd1cSJunchao Zhang #endif
24429566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&mmdata->X));
24439566063dSJacob Faibussowitsch   PetscCall(PetscFree(data));
2444ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2445ccdfe979SStefano Zampini }
2446ccdfe979SStefano Zampini 
2447ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat, Mat, Mat, PetscBool, PetscBool);
2448ccdfe979SStefano Zampini 
2449*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2450*d71ae5a4SJacob Faibussowitsch {
2451ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2452ccdfe979SStefano Zampini   Mat                           A, B;
2453afb2bd1cSJunchao Zhang   PetscInt                      m, n, blda, clda;
2454ccdfe979SStefano Zampini   PetscBool                     flg, biscuda;
2455ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2456ccdfe979SStefano Zampini   cusparseStatus_t              stat;
2457ccdfe979SStefano Zampini   cusparseOperation_t           opA;
2458ccdfe979SStefano Zampini   const PetscScalar            *barray;
2459ccdfe979SStefano Zampini   PetscScalar                  *carray;
2460ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2461ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2462ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2463ccdfe979SStefano Zampini 
2464ccdfe979SStefano Zampini   PetscFunctionBegin;
2465ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
246628b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
2467ccdfe979SStefano Zampini   mmdata = (MatMatCusparse *)product->data;
2468ccdfe979SStefano Zampini   A      = product->A;
2469ccdfe979SStefano Zampini   B      = product->B;
24709566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
247128b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2472ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2473ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
247428b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
24759566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2476ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2477ccdfe979SStefano Zampini   switch (product->type) {
2478ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2479ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2480ccdfe979SStefano Zampini     mat = cusp->mat;
2481ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2482ccdfe979SStefano Zampini     m   = A->rmap->n;
2483ccdfe979SStefano Zampini     n   = B->cmap->n;
2484ccdfe979SStefano Zampini     break;
2485ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
24861a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2487e6e9a74fSStefano Zampini       mat = cusp->mat;
2488e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2489e6e9a74fSStefano Zampini     } else {
24909566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2491ccdfe979SStefano Zampini       mat = cusp->matTranspose;
2492ccdfe979SStefano Zampini       opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2493e6e9a74fSStefano Zampini     }
2494ccdfe979SStefano Zampini     m = A->cmap->n;
2495ccdfe979SStefano Zampini     n = B->cmap->n;
2496ccdfe979SStefano Zampini     break;
2497ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2498ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2499ccdfe979SStefano Zampini     mat = cusp->mat;
2500ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2501ccdfe979SStefano Zampini     m   = A->rmap->n;
2502ccdfe979SStefano Zampini     n   = B->rmap->n;
2503ccdfe979SStefano Zampini     break;
2504*d71ae5a4SJacob Faibussowitsch   default:
2505*d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2506ccdfe979SStefano Zampini   }
250728b400f6SJacob Faibussowitsch   PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct");
2508ccdfe979SStefano Zampini   csrmat = (CsrMatrix *)mat->mat;
2509ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
25109566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda));
25119566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B));
25129566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDAGetArrayRead(B, &barray));
2513afb2bd1cSJunchao Zhang 
25149566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B, &blda));
2515c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
25169566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X, &carray));
25179566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X, &clda));
2518c8378d12SStefano Zampini   } else {
25199566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(C, &carray));
25209566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C, &clda));
2521c8378d12SStefano Zampini   }
2522c8378d12SStefano Zampini 
25239566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2524afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2525afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2526a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2527afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2528fcdce8c4SStefano Zampini     size_t mmBufferSize;
25299371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Blda != blda) {
25309371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr));
25319371c9d4SSatish Balay       mmdata->matBDescr = NULL;
25329371c9d4SSatish Balay     }
2533afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
25349566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2535afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2536afb2bd1cSJunchao Zhang     }
2537c8378d12SStefano Zampini 
25389371c9d4SSatish Balay     if (mmdata->initialized && mmdata->Clda != clda) {
25399371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr));
25409371c9d4SSatish Balay       mmdata->matCDescr = NULL;
25419371c9d4SSatish Balay     }
2542afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
25439566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL));
2544afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2545afb2bd1cSJunchao Zhang     }
2546afb2bd1cSJunchao Zhang 
2547afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
25489371c9d4SSatish Balay       stat = cusparseCreateCsr(&mat->matDescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
25499371c9d4SSatish Balay                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
25509371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2551afb2bd1cSJunchao Zhang     }
25529371c9d4SSatish Balay     stat = cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize);
25539371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
2554fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
25559566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
25569566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize));
2557fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2558fcdce8c4SStefano Zampini     }
2559afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2560afb2bd1cSJunchao Zhang   } else {
2561afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
25629566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr, csrmat->values->data().get()));
25639566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray));
25649566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray));
2565afb2bd1cSJunchao Zhang   }
2566afb2bd1cSJunchao Zhang 
2567afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
25689371c9d4SSatish Balay   stat = cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, mat->matDescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer);
25699371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2570afb2bd1cSJunchao Zhang #else
2571afb2bd1cSJunchao Zhang   PetscInt k;
2572afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2573ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2574ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2575ccdfe979SStefano Zampini     cublasStatus_t cerr;
2576ccdfe979SStefano Zampini 
25779566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
25789371c9d4SSatish Balay     cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n);
25799371c9d4SSatish Balay     PetscCallCUBLAS(cerr);
2580ccdfe979SStefano Zampini     blda = B->cmap->n;
2581afb2bd1cSJunchao Zhang     k = B->cmap->n;
2582afb2bd1cSJunchao Zhang   } else {
2583afb2bd1cSJunchao Zhang     k = B->rmap->n;
2584ccdfe979SStefano Zampini   }
2585ccdfe979SStefano Zampini 
2586afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
25879371c9d4SSatish Balay   stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda);
25889371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2589afb2bd1cSJunchao Zhang #endif
25909566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
25919566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries));
25929566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDARestoreArrayRead(B, &barray));
2593ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
25949566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
25959566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE));
2596ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
25979566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X, &carray));
25989566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE));
2599ccdfe979SStefano Zampini   } else {
26009566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(C, &carray));
2601ccdfe979SStefano Zampini   }
260248a46eb9SPierre Jolivet   if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C));
260348a46eb9SPierre Jolivet   if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
2604ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2605ccdfe979SStefano Zampini }
2606ccdfe979SStefano Zampini 
2607*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2608*d71ae5a4SJacob Faibussowitsch {
2609ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2610ccdfe979SStefano Zampini   Mat                 A, B;
2611ccdfe979SStefano Zampini   PetscInt            m, n;
2612ccdfe979SStefano Zampini   PetscBool           cisdense, flg;
2613ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2614ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2615ccdfe979SStefano Zampini 
2616ccdfe979SStefano Zampini   PetscFunctionBegin;
2617ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
261828b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2619ccdfe979SStefano Zampini   A = product->A;
2620ccdfe979SStefano Zampini   B = product->B;
26219566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
262228b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
2623ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
262408401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2625ccdfe979SStefano Zampini   switch (product->type) {
2626ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2627ccdfe979SStefano Zampini     m = A->rmap->n;
2628ccdfe979SStefano Zampini     n = B->cmap->n;
2629ccdfe979SStefano Zampini     break;
2630ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2631ccdfe979SStefano Zampini     m = A->cmap->n;
2632ccdfe979SStefano Zampini     n = B->cmap->n;
2633ccdfe979SStefano Zampini     break;
2634ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2635ccdfe979SStefano Zampini     m = A->rmap->n;
2636ccdfe979SStefano Zampini     n = B->rmap->n;
2637ccdfe979SStefano Zampini     break;
2638ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2639ccdfe979SStefano Zampini     m = B->cmap->n;
2640ccdfe979SStefano Zampini     n = B->cmap->n;
2641ccdfe979SStefano Zampini     break;
2642ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2643ccdfe979SStefano Zampini     m = B->rmap->n;
2644ccdfe979SStefano Zampini     n = B->rmap->n;
2645ccdfe979SStefano Zampini     break;
2646*d71ae5a4SJacob Faibussowitsch   default:
2647*d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2648ccdfe979SStefano Zampini   }
26499566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
2650ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
26519566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense));
26529566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQDENSECUDA));
2653ccdfe979SStefano Zampini 
2654ccdfe979SStefano Zampini   /* product data */
26559566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2656ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2657afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0)
2658afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
265948a46eb9SPierre Jolivet   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar)));
2660afb2bd1cSJunchao Zhang #endif
2661ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2662ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
26639566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X));
26649566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA));
2665ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
26669566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n));
2667ccdfe979SStefano Zampini     } else {
26689566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n));
2669ccdfe979SStefano Zampini     }
2670ccdfe979SStefano Zampini   }
2671ccdfe979SStefano Zampini   C->product->data    = mmdata;
2672ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2673ccdfe979SStefano Zampini 
2674ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2675ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2676ccdfe979SStefano Zampini }
2677ccdfe979SStefano Zampini 
2678*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2679*d71ae5a4SJacob Faibussowitsch {
2680ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2681fcdce8c4SStefano Zampini   Mat                           A, B;
2682fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2683fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ *)C->data;
2684fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2685fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2686fcdce8c4SStefano Zampini   PetscBool                     flg;
2687fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2688fcdce8c4SStefano Zampini   MatProductType                ptype;
2689fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2690fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2691fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2692fcdce8c4SStefano Zampini #endif
2693b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2694ccdfe979SStefano Zampini 
2695ccdfe979SStefano Zampini   PetscFunctionBegin;
2696ccdfe979SStefano Zampini   MatCheckProduct(C, 1);
269728b400f6SJacob Faibussowitsch   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
26989566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
269928b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
2700fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse *)C->product->data;
2701fcdce8c4SStefano Zampini   A      = product->A;
2702fcdce8c4SStefano Zampini   B      = product->B;
2703fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2704fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2705fcdce8c4SStefano Zampini     Ccusp            = (Mat_SeqAIJCUSPARSE *)C->spptr;
270608401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2707fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
270828b400f6SJacob Faibussowitsch     PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
2709fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix *)Cmat->mat;
271028b400f6SJacob Faibussowitsch     PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
2711fcdce8c4SStefano Zampini     goto finalize;
2712fcdce8c4SStefano Zampini   }
2713fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
27149566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
271528b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
27169566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
271728b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
271828b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
271928b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2720fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
2721fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
2722fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
272308401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
272408401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
272508401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
27269566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
27279566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2728fcdce8c4SStefano Zampini 
2729fcdce8c4SStefano Zampini   ptype = product->type;
2730b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2731fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
273228b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
2733fa046f9fSJunchao Zhang   }
2734b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2735fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
273628b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
2737fa046f9fSJunchao Zhang   }
2738fcdce8c4SStefano Zampini   switch (ptype) {
2739fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2740fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2741fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2742fcdce8c4SStefano Zampini     break;
2743fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2744fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2745fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2746fcdce8c4SStefano Zampini     break;
2747fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2748fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2749fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2750fcdce8c4SStefano Zampini     break;
2751*d71ae5a4SJacob Faibussowitsch   default:
2752*d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2753fcdce8c4SStefano Zampini   }
2754fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
275528b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
275628b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
275728b400f6SJacob Faibussowitsch   PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
2758fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2759fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
2760fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix *)Cmat->mat;
276128b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
276228b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
276328b400f6SJacob Faibussowitsch   PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
27649566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2765fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2766fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
27679566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2768b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
27699371c9d4SSatish Balay   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
27709371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2771b4285af6SJunchao Zhang   #else
27729371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
27739371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
27749371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
27759371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2776b4285af6SJunchao Zhang   #endif
2777fcdce8c4SStefano Zampini #else
27789371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
27799371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
27809371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
2781fcdce8c4SStefano Zampini #endif
27829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
27839566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
27849566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2785fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2786fcdce8c4SStefano Zampini finalize:
2787fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
27889566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
27899566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
27909566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
2791fcdce8c4SStefano Zampini   c->reallocs = 0;
2792fcdce8c4SStefano Zampini   C->info.mallocs += 0;
2793fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2794fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2795fcdce8c4SStefano Zampini   C->num_ass++;
2796ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2797ccdfe979SStefano Zampini }
2798fcdce8c4SStefano Zampini 
2799*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2800*d71ae5a4SJacob Faibussowitsch {
2801fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2802fcdce8c4SStefano Zampini   Mat                           A, B;
2803fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp, *Bcusp, *Ccusp;
2804fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a, *b, *c;
2805fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
2806fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
2807fcdce8c4SStefano Zampini   PetscInt                      i, j, m, n, k;
2808fcdce8c4SStefano Zampini   PetscBool                     flg;
2809fcdce8c4SStefano Zampini   cusparseStatus_t              stat;
2810fcdce8c4SStefano Zampini   MatProductType                ptype;
2811fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2812fcdce8c4SStefano Zampini   PetscLogDouble                flops;
2813fcdce8c4SStefano Zampini   PetscBool                     biscompressed, ciscompressed;
2814fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2815fcdce8c4SStefano Zampini   int64_t              C_num_rows1, C_num_cols1, C_nnz1;
2816fcdce8c4SStefano Zampini   cusparseSpMatDescr_t BmatSpDescr;
2817fcdce8c4SStefano Zampini #else
2818fcdce8c4SStefano Zampini   int cnz;
2819fcdce8c4SStefano Zampini #endif
2820b4285af6SJunchao Zhang   cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2821fcdce8c4SStefano Zampini 
2822fcdce8c4SStefano Zampini   PetscFunctionBegin;
2823fcdce8c4SStefano Zampini   MatCheckProduct(C, 1);
282428b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
2825fcdce8c4SStefano Zampini   A = product->A;
2826fcdce8c4SStefano Zampini   B = product->B;
28279566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
282828b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
28299566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
283028b400f6SJacob Faibussowitsch   PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
2831fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ *)A->data;
2832fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ *)B->data;
2833fcdce8c4SStefano Zampini   /* product data */
28349566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2835fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2836fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2837fcdce8c4SStefano Zampini 
28389566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
28399566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2840d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2841d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
284208401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
284308401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
2844d60bce21SJunchao Zhang 
2845fcdce8c4SStefano Zampini   ptype = product->type;
2846b94d7dedSBarry Smith   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
2847fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2848fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2849fa046f9fSJunchao Zhang   }
2850b94d7dedSBarry Smith   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
2851fa046f9fSJunchao Zhang     ptype                                          = MATPRODUCT_AB;
2852fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2853fa046f9fSJunchao Zhang   }
2854fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2855fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2856fcdce8c4SStefano Zampini   switch (ptype) {
2857fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2858fcdce8c4SStefano Zampini     m    = A->rmap->n;
2859fcdce8c4SStefano Zampini     n    = B->cmap->n;
2860fcdce8c4SStefano Zampini     k    = A->cmap->n;
2861fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2862fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2863fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2864fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2865fcdce8c4SStefano Zampini     break;
2866fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2867fcdce8c4SStefano Zampini     m = A->cmap->n;
2868fcdce8c4SStefano Zampini     n = B->cmap->n;
2869fcdce8c4SStefano Zampini     k = A->rmap->n;
28709566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2871fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2872fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2873fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2874fcdce8c4SStefano Zampini     break;
2875fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2876fcdce8c4SStefano Zampini     m = A->rmap->n;
2877fcdce8c4SStefano Zampini     n = B->rmap->n;
2878fcdce8c4SStefano Zampini     k = A->cmap->n;
28799566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2880fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2881fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2882fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2883fcdce8c4SStefano Zampini     break;
2884*d71ae5a4SJacob Faibussowitsch   default:
2885*d71ae5a4SJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
2886fcdce8c4SStefano Zampini   }
2887fcdce8c4SStefano Zampini 
2888fcdce8c4SStefano Zampini   /* create cusparse matrix */
28899566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C, m, n, m, n));
28909566063dSJacob Faibussowitsch   PetscCall(MatSetType(C, MATSEQAIJCUSPARSE));
2891fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ *)C->data;
2892fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
2893fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2894fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2895fcdce8c4SStefano Zampini 
2896fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2897fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2898fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
28999566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex));
29009566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows));
2901fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2902fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2903fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows);
2904fcdce8c4SStefano Zampini   } else {
2905fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2906fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2907fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2908fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2909fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2910fcdce8c4SStefano Zampini   }
2911fcdce8c4SStefano Zampini   Ccusp->nrows      = ciscompressed ? c->compressedrow.nrows : m;
2912fcdce8c4SStefano Zampini   Ccusp->mat        = Cmat;
2913fcdce8c4SStefano Zampini   Ccusp->mat->mat   = Ccsr;
2914fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2915fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2916fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1);
29179566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
29189566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
29199566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
29209566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
29219566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
29229566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
29239566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
29249566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
29259566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
2926fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2927fcdce8c4SStefano Zampini     thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0);
2928fcdce8c4SStefano Zampini     c->nz                = 0;
2929fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2930fcdce8c4SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
2931fcdce8c4SStefano Zampini     goto finalizesym;
2932fcdce8c4SStefano Zampini   }
2933fcdce8c4SStefano Zampini 
293428b400f6SJacob Faibussowitsch   PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
293528b400f6SJacob Faibussowitsch   PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
2936fcdce8c4SStefano Zampini   Acsr = (CsrMatrix *)Amat->mat;
2937fcdce8c4SStefano Zampini   if (!biscompressed) {
2938fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix *)Bmat->mat;
2939fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2940fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2941fcdce8c4SStefano Zampini #endif
2942fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2943fcdce8c4SStefano Zampini     CsrMatrix *cBcsr     = (CsrMatrix *)Bmat->mat;
2944fcdce8c4SStefano Zampini     Bcsr                 = new CsrMatrix;
2945fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2946fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2947fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2948fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2949fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2950fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2951fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
2952fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
29539566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
2954fcdce8c4SStefano Zampini     }
2955fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2956fcdce8c4SStefano Zampini     mmdata->Bcsr      = Bcsr;
2957fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
2958fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
29599371c9d4SSatish Balay       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
29609371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
2961fcdce8c4SStefano Zampini     }
2962fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2963fcdce8c4SStefano Zampini #endif
2964fcdce8c4SStefano Zampini   }
296528b400f6SJacob Faibussowitsch   PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
296628b400f6SJacob Faibussowitsch   PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
2967fcdce8c4SStefano Zampini   /* precompute flops count */
2968fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2969fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2970fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2971fcdce8c4SStefano Zampini       const PetscInt en = a->i[i + 1];
2972fcdce8c4SStefano Zampini       for (j = st; j < en; j++) {
2973fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2974fcdce8c4SStefano Zampini         flops += 2. * (b->i[brow + 1] - b->i[brow]);
2975fcdce8c4SStefano Zampini       }
2976fcdce8c4SStefano Zampini     }
2977fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2978fcdce8c4SStefano Zampini     for (i = 0, flops = 0; i < A->rmap->n; i++) {
2979fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i + 1] - a->i[i];
2980fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i + 1] - b->i[i];
2981fcdce8c4SStefano Zampini       flops += (2. * anzi) * bnzi;
2982fcdce8c4SStefano Zampini     }
2983fcdce8c4SStefano Zampini   } else { /* TODO */
2984fcdce8c4SStefano Zampini     flops = 0.;
2985fcdce8c4SStefano Zampini   }
2986fcdce8c4SStefano Zampini 
2987fcdce8c4SStefano Zampini   mmdata->flops = flops;
29889566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2989b4285af6SJunchao Zhang 
2990fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
29919566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
29929371c9d4SSatish Balay   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, NULL, NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
29939371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
29949566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2995b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
2996b4285af6SJunchao Zhang   {
2997b4285af6SJunchao Zhang     /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2998b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2999b4285af6SJunchao Zhang   */
3000b4285af6SJunchao Zhang     void *dBuffer1 = NULL;
3001b4285af6SJunchao Zhang     void *dBuffer2 = NULL;
3002b4285af6SJunchao Zhang     void *dBuffer3 = NULL;
3003b4285af6SJunchao Zhang     /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3004b4285af6SJunchao Zhang     size_t bufferSize1 = 0;
3005b4285af6SJunchao Zhang     size_t bufferSize2 = 0;
3006b4285af6SJunchao Zhang     size_t bufferSize3 = 0;
3007b4285af6SJunchao Zhang     size_t bufferSize4 = 0;
3008b4285af6SJunchao Zhang     size_t bufferSize5 = 0;
3009b4285af6SJunchao Zhang 
3010b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
3011b4285af6SJunchao Zhang     /* ask bufferSize1 bytes for external memory */
30129371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL);
30139371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
30149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1));
3015b4285af6SJunchao Zhang     /* inspect the matrices A and B to understand the memory requirement for the next step */
30169371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1);
30179371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3018b4285af6SJunchao Zhang 
3019b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
30209371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);
30219371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
30229566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2));
30239566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3));
30249566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4));
30259371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);
30269371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
30279566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer1));
30289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer2));
3029b4285af6SJunchao Zhang 
3030b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
3031b4285af6SJunchao Zhang     /* get matrix C non-zero entries C_nnz1 */
30329566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3033b4285af6SJunchao Zhang     c->nz = (PetscInt)C_nnz1;
3034b4285af6SJunchao Zhang     /* allocate matrix C */
30359371c9d4SSatish Balay     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30369371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
30379371c9d4SSatish Balay     Ccsr->values = new THRUSTARRAY(c->nz);
30389371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3039b4285af6SJunchao Zhang     /* update matC with the new pointers */
30409371c9d4SSatish Balay     stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
30419371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
3042b4285af6SJunchao Zhang 
3043b4285af6SJunchao Zhang     /*----------------------------------------------------------------------*/
30449371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL);
30459371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
30469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5));
30479371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5);
30489371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
30499566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(dBuffer3));
30509371c9d4SSatish Balay     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30519371c9d4SSatish Balay     PetscCallCUSPARSE(stat);
30529566063dSJacob Faibussowitsch     PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024));
3053b4285af6SJunchao Zhang   }
3054ae37ee31SJunchao Zhang   #else
3055b4285af6SJunchao Zhang   size_t bufSize2;
3056fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
30579371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, NULL);
30589371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
30599566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2));
3060fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
30619371c9d4SSatish Balay   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);
30629371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3063fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
30649371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);
30659371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3066fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3067fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3068fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3069fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3070fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
30719566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize));
3072fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
30739371c9d4SSatish Balay   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
30749371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3075fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
30769566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3077fcdce8c4SStefano Zampini   c->nz = (PetscInt)C_nnz1;
30789371c9d4SSatish Balay   PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024,
30799371c9d4SSatish Balay                       mmdata->mmBufferSize / 1024));
3080fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30819566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3082fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30839566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
30849371c9d4SSatish Balay   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get());
30859371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
30869371c9d4SSatish Balay   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
30879371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3088ae37ee31SJunchao Zhang   #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3089fcdce8c4SStefano Zampini #else
30909566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
30919371c9d4SSatish Balay   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
30929371c9d4SSatish Balay                              Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);
30939371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3094fcdce8c4SStefano Zampini   c->nz = cnz;
3095fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
30969566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3097fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
30989566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3099fcdce8c4SStefano Zampini 
31009566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3101fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3102fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3103fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
31049371c9d4SSatish Balay   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
31059371c9d4SSatish Balay                              Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
31069371c9d4SSatish Balay   PetscCallCUSPARSE(stat);
3107fcdce8c4SStefano Zampini #endif
31089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
31099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3110fcdce8c4SStefano Zampini finalizesym:
3111fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3112fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3113fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
31149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m + 1, &c->i));
31159566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->j));
3116fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3117fcdce8c4SStefano Zampini     PetscInt      *d_i = c->i;
3118fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3119fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3120fcdce8c4SStefano Zampini     ii = *Ccsr->row_offsets;
3121fcdce8c4SStefano Zampini     jj = *Ccsr->column_indices;
3122fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
31239566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
31249566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3125fcdce8c4SStefano Zampini   } else {
3126fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3127fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
31289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
31299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
3130fcdce8c4SStefano Zampini   }
3131fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3132fcdce8c4SStefano Zampini     PetscInt r = 0;
3133fcdce8c4SStefano Zampini     c->i[0]    = 0;
3134fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3135fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3136fcdce8c4SStefano Zampini       const PetscInt old  = c->compressedrow.i[k];
3137fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r + 1] = old;
3138fcdce8c4SStefano Zampini     }
3139fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows];
3140fcdce8c4SStefano Zampini   }
31419566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
31429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->ilen));
31439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m, &c->imax));
3144fcdce8c4SStefano Zampini   c->maxnz         = c->nz;
3145fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3146fcdce8c4SStefano Zampini   c->rmax          = 0;
3147fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3148fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k + 1] - c->i[k];
3149fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3150fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt) !!nn;
3151fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax, nn);
3152fcdce8c4SStefano Zampini   }
31539566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
31549566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz, &c->a));
3155fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3156fcdce8c4SStefano Zampini 
3157fcdce8c4SStefano Zampini   C->nonzerostate++;
31589566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
31599566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3160fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3161fcdce8c4SStefano Zampini   C->offloadmask      = PETSC_OFFLOAD_UNALLOCATED;
3162fcdce8c4SStefano Zampini   C->preallocated     = PETSC_TRUE;
3163fcdce8c4SStefano Zampini   C->assembled        = PETSC_FALSE;
3164fcdce8c4SStefano Zampini   C->was_assembled    = PETSC_FALSE;
3165abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3166fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3167fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3168fcdce8c4SStefano Zampini   }
3169fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3170fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3171fcdce8c4SStefano Zampini }
3172fcdce8c4SStefano Zampini 
3173fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3174fcdce8c4SStefano Zampini 
3175fcdce8c4SStefano Zampini /* handles sparse or dense B */
3176*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
3177*d71ae5a4SJacob Faibussowitsch {
3178fcdce8c4SStefano Zampini   Mat_Product *product = mat->product;
3179fcdce8c4SStefano Zampini   PetscBool    isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE;
3180fcdce8c4SStefano Zampini 
3181fcdce8c4SStefano Zampini   PetscFunctionBegin;
3182fcdce8c4SStefano Zampini   MatCheckProduct(mat, 1);
31839566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense));
318448a46eb9SPierre Jolivet   if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp));
3185fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
3186fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
318748a46eb9SPierre Jolivet     if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp));
3188fcdce8c4SStefano Zampini   }
318965e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
319065e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
319165e4b4d4SStefano Zampini     switch (product->type) {
319265e4b4d4SStefano Zampini     case MATPRODUCT_AB:
319365e4b4d4SStefano Zampini       if (product->api_user) {
3194d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat");
31959566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3196d0609cedSBarry Smith         PetscOptionsEnd();
319765e4b4d4SStefano Zampini       } else {
3198d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat");
31999566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL));
3200d0609cedSBarry Smith         PetscOptionsEnd();
320165e4b4d4SStefano Zampini       }
320265e4b4d4SStefano Zampini       break;
320365e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
320465e4b4d4SStefano Zampini       if (product->api_user) {
3205d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat");
32069566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3207d0609cedSBarry Smith         PetscOptionsEnd();
320865e4b4d4SStefano Zampini       } else {
3209d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat");
32109566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL));
3211d0609cedSBarry Smith         PetscOptionsEnd();
321265e4b4d4SStefano Zampini       }
321365e4b4d4SStefano Zampini       break;
321465e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
321565e4b4d4SStefano Zampini       if (product->api_user) {
3216d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat");
32179566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3218d0609cedSBarry Smith         PetscOptionsEnd();
321965e4b4d4SStefano Zampini       } else {
3220d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat");
32219566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL));
3222d0609cedSBarry Smith         PetscOptionsEnd();
322365e4b4d4SStefano Zampini       }
322465e4b4d4SStefano Zampini       break;
322565e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
322665e4b4d4SStefano Zampini       if (product->api_user) {
3227d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat");
32289566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3229d0609cedSBarry Smith         PetscOptionsEnd();
323065e4b4d4SStefano Zampini       } else {
3231d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat");
32329566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL));
3233d0609cedSBarry Smith         PetscOptionsEnd();
323465e4b4d4SStefano Zampini       }
323565e4b4d4SStefano Zampini       break;
323665e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
323765e4b4d4SStefano Zampini       if (product->api_user) {
3238d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMatMult", "Mat");
32399566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3240d0609cedSBarry Smith         PetscOptionsEnd();
324165e4b4d4SStefano Zampini       } else {
3242d0609cedSBarry Smith         PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat");
32439566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL));
3244d0609cedSBarry Smith         PetscOptionsEnd();
324565e4b4d4SStefano Zampini       }
324665e4b4d4SStefano Zampini       break;
3247*d71ae5a4SJacob Faibussowitsch     default:
3248*d71ae5a4SJacob Faibussowitsch       break;
324965e4b4d4SStefano Zampini     }
325065e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
325165e4b4d4SStefano Zampini   }
325265e4b4d4SStefano Zampini   /* dispatch */
3253fcdce8c4SStefano Zampini   if (isdense) {
3254ccdfe979SStefano Zampini     switch (product->type) {
3255ccdfe979SStefano Zampini     case MATPRODUCT_AB:
3256ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
3257ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
3258ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
3259ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
3260fcdce8c4SStefano Zampini       if (product->A->boundtocpu) {
32619566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
3262fcdce8c4SStefano Zampini       } else {
3263fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
3264fcdce8c4SStefano Zampini       }
3265fcdce8c4SStefano Zampini       break;
3266*d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3267*d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3268*d71ae5a4SJacob Faibussowitsch       break;
3269*d71ae5a4SJacob Faibussowitsch     default:
3270*d71ae5a4SJacob Faibussowitsch       break;
3271ccdfe979SStefano Zampini     }
3272fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
3273fcdce8c4SStefano Zampini     switch (product->type) {
3274fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
3275fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
3276*d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABt:
3277*d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3278*d71ae5a4SJacob Faibussowitsch       break;
3279fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
3280fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
3281*d71ae5a4SJacob Faibussowitsch     case MATPRODUCT_ABC:
3282*d71ae5a4SJacob Faibussowitsch       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
3283*d71ae5a4SJacob Faibussowitsch       break;
3284*d71ae5a4SJacob Faibussowitsch     default:
3285*d71ae5a4SJacob Faibussowitsch       break;
3286fcdce8c4SStefano Zampini     }
3287fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
32889566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
3289fcdce8c4SStefano Zampini   }
3290ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3291ccdfe979SStefano Zampini }
3292ccdfe979SStefano Zampini 
3293*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3294*d71ae5a4SJacob Faibussowitsch {
32959ae82921SPaul Mullowney   PetscFunctionBegin;
32969566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE));
3297e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3298e6e9a74fSStefano Zampini }
3299e6e9a74fSStefano Zampini 
3300*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3301*d71ae5a4SJacob Faibussowitsch {
3302e6e9a74fSStefano Zampini   PetscFunctionBegin;
33039566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE));
3304e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3305e6e9a74fSStefano Zampini }
3306e6e9a74fSStefano Zampini 
3307*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3308*d71ae5a4SJacob Faibussowitsch {
3309e6e9a74fSStefano Zampini   PetscFunctionBegin;
33109566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE));
3311e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3312e6e9a74fSStefano Zampini }
3313e6e9a74fSStefano Zampini 
3314*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3315*d71ae5a4SJacob Faibussowitsch {
3316e6e9a74fSStefano Zampini   PetscFunctionBegin;
33179566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE));
33189ae82921SPaul Mullowney   PetscFunctionReturn(0);
33199ae82921SPaul Mullowney }
33209ae82921SPaul Mullowney 
3321*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy)
3322*d71ae5a4SJacob Faibussowitsch {
3323ca45077fSPaul Mullowney   PetscFunctionBegin;
33249566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE));
3325ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3326ca45077fSPaul Mullowney }
3327ca45077fSPaul Mullowney 
3328*d71ae5a4SJacob Faibussowitsch __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y)
3329*d71ae5a4SJacob Faibussowitsch {
3330a0e72f99SJunchao Zhang   int i = blockIdx.x * blockDim.x + threadIdx.x;
3331a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
3332a0e72f99SJunchao Zhang }
3333a0e72f99SJunchao Zhang 
3334afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3335*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm)
3336*d71ae5a4SJacob Faibussowitsch {
33379ae82921SPaul Mullowney   Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
3338aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
33399ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3340e6e9a74fSStefano Zampini   PetscScalar                  *xarray, *zarray, *dptr, *beta, *xptr;
3341e6e9a74fSStefano Zampini   cusparseOperation_t           opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3342e6e9a74fSStefano Zampini   PetscBool                     compressed;
3343afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3344afb2bd1cSJunchao Zhang   PetscInt nx, ny;
3345afb2bd1cSJunchao Zhang #endif
33466e111a19SKarl Rupp 
33479ae82921SPaul Mullowney   PetscFunctionBegin;
334808401ef6SPierre Jolivet   PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported");
3349cbc6b225SStefano Zampini   if (!a->nz) {
33509566063dSJacob Faibussowitsch     if (!yy) PetscCall(VecSet_SeqCUDA(zz, 0));
33519566063dSJacob Faibussowitsch     else PetscCall(VecCopy_SeqCUDA(yy, zz));
3352e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
3353e6e9a74fSStefano Zampini   }
335434d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
33559566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3356e6e9a74fSStefano Zampini   if (!trans) {
33579ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
33585f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3359e6e9a74fSStefano Zampini   } else {
33601a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3361e6e9a74fSStefano Zampini       opA       = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3362e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
3363e6e9a74fSStefano Zampini     } else {
33649566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3365e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
3366e6e9a74fSStefano Zampini     }
3367e6e9a74fSStefano Zampini   }
3368e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3369e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3370213423ffSJunchao Zhang 
3371e6e9a74fSStefano Zampini   try {
33729566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray));
33739566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
33749566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz, &zarray));     /* write zz, so no need to init zarray on GPU */
3375afb2bd1cSJunchao Zhang 
33769566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3377e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3378afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3379afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3380afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3381afb2bd1cSJunchao Zhang       */
3382e6e9a74fSStefano Zampini       xptr = xarray;
3383afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3384213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3385afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3386afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3387afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3388afb2bd1cSJunchao Zhang        */
3389afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3390afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3391afb2bd1cSJunchao Zhang         nx             = mat->num_cols;
3392afb2bd1cSJunchao Zhang         ny             = mat->num_rows;
3393afb2bd1cSJunchao Zhang       }
3394afb2bd1cSJunchao Zhang #endif
3395e6e9a74fSStefano Zampini     } else {
3396afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3397afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3398afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3399afb2bd1cSJunchao Zhang        */
3400afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3401e6e9a74fSStefano Zampini       dptr = zarray;
3402e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3403afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3404e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3405d0967f54SJacob Faibussowitsch 
3406d0967f54SJacob Faibussowitsch         thrust::for_each(
3407d0967f54SJacob Faibussowitsch #if PetscDefined(HAVE_THRUST_ASYNC)
3408d0967f54SJacob Faibussowitsch           thrust::cuda::par.on(PetscDefaultCudaStream),
3409d0967f54SJacob Faibussowitsch #endif
3410d0967f54SJacob Faibussowitsch           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
34119371c9d4SSatish Balay           thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse());
3412e6e9a74fSStefano Zampini       }
3413afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3414afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3415afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
3416afb2bd1cSJunchao Zhang         nx             = mat->num_rows;
3417afb2bd1cSJunchao Zhang         ny             = mat->num_cols;
3418afb2bd1cSJunchao Zhang       }
3419afb2bd1cSJunchao Zhang #endif
3420e6e9a74fSStefano Zampini     }
34219ae82921SPaul Mullowney 
3422afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3423aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3424afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
34255f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3426afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
34279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype));
34289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype));
34299371c9d4SSatish Balay         PetscCallCUSPARSE(
34309371c9d4SSatish Balay           cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize));
34319566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize));
3432afb2bd1cSJunchao Zhang 
3433afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3434afb2bd1cSJunchao Zhang       } else {
3435afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
34369566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr));
34379566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr));
3438afb2bd1cSJunchao Zhang       }
3439afb2bd1cSJunchao Zhang 
34409371c9d4SSatish Balay       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
34419371c9d4SSatish Balay                                      matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer));
3442afb2bd1cSJunchao Zhang #else
34437656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix *)matstruct->mat;
34449371c9d4SSatish Balay       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr));
3445afb2bd1cSJunchao Zhang #endif
3446aa372e3fSPaul Mullowney     } else {
3447213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3448afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
3449afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3450afb2bd1cSJunchao Zhang #else
3451301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
34529371c9d4SSatish Balay         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr));
3453afb2bd1cSJunchao Zhang #endif
3454a65300a6SPaul Mullowney       }
3455aa372e3fSPaul Mullowney     }
34569566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3457aa372e3fSPaul Mullowney 
3458e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3459213423ffSJunchao Zhang       if (yy) {                                    /* MatMultAdd: zz = A*xx + yy */
3460213423ffSJunchao Zhang         if (compressed) {                          /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
34619566063dSJacob Faibussowitsch           PetscCall(VecCopy_SeqCUDA(yy, zz));      /* zz = yy */
3462e6e9a74fSStefano Zampini         } else if (zz != yy) {                     /* A is not compressed. zz already contains A*xx, and we just need to add yy */
34639566063dSJacob Faibussowitsch           PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */
34647656d835SStefano Zampini         }
3465213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
34669566063dSJacob Faibussowitsch         PetscCall(VecSet_SeqCUDA(zz, 0));
34677656d835SStefano Zampini       }
34687656d835SStefano Zampini 
3469213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3470213423ffSJunchao Zhang       if (compressed) {
34719566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3472a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3473a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3474a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3475a0e72f99SJunchao Zhang          */
3476a0e72f99SJunchao Zhang #if 0
3477a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3478a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3479a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3480e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3481c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3482a0e72f99SJunchao Zhang #else
3483a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3484a0e72f99SJunchao Zhang         ScatterAdd<<<(n + 255) / 256, 256, 0, PetscDefaultCudaStream>>>(n, matstruct->cprowIndices->data().get(), cusparsestruct->workVector->data().get(), zarray);
3485a0e72f99SJunchao Zhang #endif
34869566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3487e6e9a74fSStefano Zampini       }
3488e6e9a74fSStefano Zampini     } else {
34899371c9d4SSatish Balay       if (yy && yy != zz) { PetscCall(VecAXPY_SeqCUDA(zz, 1.0, yy)); /* zz += yy */ }
3490e6e9a74fSStefano Zampini     }
34919566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx, (const PetscScalar **)&xarray));
34929566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz, &zarray));
34939566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz, &zarray));
3494*d71ae5a4SJacob Faibussowitsch   } catch (char *ex) {
3495*d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex);
3496*d71ae5a4SJacob Faibussowitsch   }
3497e6e9a74fSStefano Zampini   if (yy) {
34989566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz));
3499e6e9a74fSStefano Zampini   } else {
35009566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0 * a->nz - a->nonzerorowcnt));
3501e6e9a74fSStefano Zampini   }
35029ae82921SPaul Mullowney   PetscFunctionReturn(0);
35039ae82921SPaul Mullowney }
35049ae82921SPaul Mullowney 
3505*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
3506*d71ae5a4SJacob Faibussowitsch {
3507ca45077fSPaul Mullowney   PetscFunctionBegin;
35089566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_FALSE));
3509ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3510ca45077fSPaul Mullowney }
3511ca45077fSPaul Mullowney 
3512*d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A, MatAssemblyType mode)
3513*d71ae5a4SJacob Faibussowitsch {
3514042217e8SBarry Smith   PetscObjectState    onnz = A->nonzerostate;
3515042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
35163fa6b06aSMark Adams 
3517042217e8SBarry Smith   PetscFunctionBegin;
35189566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
3519042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
35209566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
35219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3522042217e8SBarry Smith     cusp->deviceMat = NULL;
3523042217e8SBarry Smith   }
35249ae82921SPaul Mullowney   PetscFunctionReturn(0);
35259ae82921SPaul Mullowney }
35269ae82921SPaul Mullowney 
35279ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3528e057df02SPaul Mullowney /*@
352911a5261eSBarry Smith    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
3530e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
353111a5261eSBarry Smith    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
3532e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3533e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3534e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
35359ae82921SPaul Mullowney 
3536d083f849SBarry Smith    Collective
35379ae82921SPaul Mullowney 
35389ae82921SPaul Mullowney    Input Parameters:
353911a5261eSBarry Smith +  comm - MPI communicator, set to `PETSC_COMM_SELF`
35409ae82921SPaul Mullowney .  m - number of rows
35419ae82921SPaul Mullowney .  n - number of columns
35429ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
35439ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
35440298fd71SBarry Smith          (possibly different for each row) or NULL
35459ae82921SPaul Mullowney 
35469ae82921SPaul Mullowney    Output Parameter:
35479ae82921SPaul Mullowney .  A - the matrix
35489ae82921SPaul Mullowney 
354911a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
35509ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
355111a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
35529ae82921SPaul Mullowney 
35539ae82921SPaul Mullowney    Notes:
35549ae82921SPaul Mullowney    If nnz is given then nz is ignored
35559ae82921SPaul Mullowney 
355611a5261eSBarry Smith    The AIJ format, also called
355711a5261eSBarry Smith    compressed row storage, is fully compatible with standard Fortran 77
35589ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
35599ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
35609ae82921SPaul Mullowney 
35619ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
356211a5261eSBarry Smith    Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory
35639ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
35649ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
35659ae82921SPaul Mullowney 
35669ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
35679ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
35689ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
35699ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
35709ae82921SPaul Mullowney 
35719ae82921SPaul Mullowney    Level: intermediate
35729ae82921SPaul Mullowney 
357311a5261eSBarry Smith .seealso: `MATSEQAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
35749ae82921SPaul Mullowney @*/
3575*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
3576*d71ae5a4SJacob Faibussowitsch {
35779ae82921SPaul Mullowney   PetscFunctionBegin;
35789566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
35799566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, m, n));
35809566063dSJacob Faibussowitsch   PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
35819566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
35829ae82921SPaul Mullowney   PetscFunctionReturn(0);
35839ae82921SPaul Mullowney }
35849ae82921SPaul Mullowney 
/*
  Destroys the GPU-side state of a SeqAIJCUSPARSE matrix: the SpMV structures for an
  unfactored matrix, or the triangular-factor structures for a factored one. It then
  removes the function pointers composed onto the object (so nothing can call back
  into freed CUSPARSE state) and delegates the host-side teardown to MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE **)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors **)&A->spptr));
  }
  /* clear every method composed by MatConvert_SeqAIJ_SeqAIJCUSPARSE() / MatBindToCPU_SeqAIJCUSPARSE() */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetUseCPUSolve_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
36069ae82921SPaul Mullowney 
3607ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat, MatType, MatReuse, Mat *);
360895639643SRichard Tran Mills static PetscErrorCode       MatBindToCPU_SeqAIJCUSPARSE(Mat, PetscBool);
static PetscErrorCode       MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  PetscFunctionBegin;
  /* duplicate on the host first, then convert the copy in place to the CUSPARSE type */
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(0);
}
36169ff858a8SKarl Rupp 
/*
  Y <- a*X + Y, performed on the GPU when both operands are SeqAIJCUSPARSE.

  Three paths are taken depending on the nonzero-pattern relation `str`:
  - SAME_NONZERO_PATTERN:   a single cuBLAS axpy over the value arrays;
  - SUBSET_NONZERO_PATTERN: cusparse csr spgeam (general sparse matrix add);
  - otherwise:              fall back to the host MatAXPY_SeqAIJ().
  If either operand is not bound to the GPU (different ->ops->axpy) the host path
  is used unconditionally.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y, PetscScalar a, Mat X, MatStructure str)
{
  Mat_SeqAIJ         *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
  Mat_SeqAIJCUSPARSE *cy;
  Mat_SeqAIJCUSPARSE *cx;
  PetscScalar        *ay;
  const PetscScalar  *ax;
  CsrMatrix          *csry, *csrx;

  PetscFunctionBegin;
  cy = (Mat_SeqAIJCUSPARSE *)Y->spptr;
  cx = (Mat_SeqAIJCUSPARSE *)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* one of the two matrices is on the CPU: invalidate Y's cached transpose and do a host axpy */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix *)cy->mat->mat;
  csrx = (CsrMatrix *)cx->mat->mat;
  /* see if we can turn this into a cublas axpy: if the row offsets and column
     indices compare equal on the device, the patterns are in fact identical */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin());
    if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin());
    if (eq) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar b = 1.0; /* computes Y = a*X + 1*Y in place */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    size_t bufferSize;
    void  *buffer;
#endif

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    /* the scalars a and b live on the host, so switch the pointer mode for this call */
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    /* CUDA >= 11 spgeam requires an explicit workspace query + allocation */
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                                     csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer, bufferSize));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaFree(buffer));
#else
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(),
                                          csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get()));
    PetscCall(PetscLogGpuFlops(x->nz + y->nz));
    PetscCall(PetscLogGpuTimeEnd());
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry for entry, so a dense axpy suffices */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   one = 1, bnz = 1;

    PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay));
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(x->nz, &bnz));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle, bnz, &a, ax, one, ay, one));
    PetscCall(PetscLogGpuFlops(2.0 * bnz));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay));
    PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  } else {
    /* different patterns: fall back to the host implementation */
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y, a, X, str));
  }
  PetscFunctionReturn(0);
}
370295639643SRichard Tran Mills 
/* Y <- a*Y: scale the nonzero values of Y in place on the GPU with a single cuBLAS scal call */
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y, PetscScalar a)
{
  Mat_SeqAIJ    *aij = (Mat_SeqAIJ *)Y->data;
  PetscScalar   *vals;
  cublasHandle_t handle;
  PetscBLASInt   stride = 1, n = 1;

  PetscFunctionBegin;
  PetscCall(PetscCUBLASGetHandle(&handle));
  PetscCall(PetscBLASIntCast(aij->nz, &n)); /* number of stored nonzeros */
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &vals)); /* device pointer to the CSR value array */
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle, n, &a, vals, stride));
  PetscCall(PetscLogGpuFlops(n));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y)); /* any cached diagonal is stale after scaling */
  PetscFunctionReturn(0);
}
372233c9ba73SStefano Zampini 
/*
  Zeros all stored values of the matrix. For an unfactored matrix the device-side
  CSR value arrays (and those of a cached transpose, if present) are zeroed with
  thrust::fill; the host array is always zeroed. The offload mask is set to BOTH
  only when the GPU copy was actually zeroed, otherwise to CPU.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   both = PETSC_FALSE; /* set when the device values were zeroed too */
  Mat_SeqAIJ *a    = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;
    if (spptr->mat) {
      CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;
      if (matrix->values) {
        both = PETSC_TRUE;
        thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
    if (spptr->matTranspose) {
      /* keep the cached transpose consistent instead of invalidating it */
      CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;
      if (matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    }
  }
  PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n])); /* host values: a->i[nrows] == number of stored entries */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
37493fa6b06aSMark Adams 
/*
  Switches the matrix between CPU (flg == PETSC_TRUE) and GPU implementations by
  swapping the function tables. Binding to the CPU first copies the values back
  from the device. Factored matrices only record the flag. Inodes are re-enabled
  only on the CPU (and only if an inode structure was found at assembly).
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A, PetscBool flg)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (flg) {
    /* make sure the host copy is current before routing everything to the CPU kernels */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* wipe the SeqAIJ-level hooks (array accessors etc.) installed by the GPU path */
    PetscCall(PetscMemzero(a->ops, sizeof(Mat_SeqAIJOps)));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL));
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* SeqAIJ-level accessors that keep host/device copies coherent */
    a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE));
  }
  A->boundtocpu = flg;
  if (flg && a->inode.size) {
    a->inode.use = PETSC_TRUE;
  } else {
    /* inode kernels are CPU-only */
    a->inode.use = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3813a587d139SMark 
/*
  Converts a SeqAIJ matrix to SeqAIJCUSPARSE. Depending on `reuse` the target is a
  fresh duplicate, an existing matrix (values copied), or the input itself
  (MAT_INPLACE_MATRIX). Allocates the CUSPARSE context (SpMV structs for an
  unfactored matrix, tri-factor structs otherwise), installs the GPU function
  tables via MatBindToCPU_SeqAIJCUSPARSE(PETSC_FALSE), and composes the
  type-specific methods.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat B;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  if (reuse == MAT_INITIAL_MATRIX) {
    PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
  } else if (reuse == MAT_REUSE_MATRIX) {
    PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN));
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA type */
  PetscCall(PetscFree(B->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;
      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      spptr->format = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  #if CUSPARSE_VERSION > 11301
      spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
  #endif
      spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = spptr;
    } else {
      /* factored matrix: only the handle is needed here, factor structs come later */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      PetscCall(PetscNew(&spptr));
      PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
      PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream));
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B, PETSC_FALSE)); /* install the GPU function tables */
  PetscCall(PetscObjectChangeTypeName((PetscObject)B, MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatConvert_seqaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)B, "MatCUSPARSESetUseCPUSolve_C", MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
38739ae82921SPaul Mullowney 
/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, then
   convert it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &B));
  PetscFunctionReturn(0);
}
388102fe1965SBarry Smith 
38823ca39a21SBarry Smith /*MC
3883e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3884e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
3888e057df02SPaul Mullowney 
3889e057df02SPaul Mullowney    Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to `MatSetFromOptions()`
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for `MatMult()` and factors in `MatSolve()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for `MatMult()`) during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do `MatSolve()` on CPU
3894e057df02SPaul Mullowney 
3895e057df02SPaul Mullowney   Level: beginner
3896e057df02SPaul Mullowney 
389711a5261eSBarry Smith .seealso: `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetUseCPUSolve()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
3898e057df02SPaul Mullowney M*/
38997f756511SDominic Meiser 
3900bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat, MatFactorType, Mat *);
39010f39cd5aSBarry Smith 
/* Registers the CUSPARSE solver packages with the MatSolverType registry:
   the banded LU solver for plain SeqAIJ, and LU/Cholesky/ILU/ICC factorizations
   for SeqAIJCUSPARSE matrices. Called once at package initialization. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse_band));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_LU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_CHOLESKY, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ILU, MatGetFactor_seqaijcusparse_cusparse));
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, MAT_FACTOR_ICC, MatGetFactor_seqaijcusparse_cusparse));

  PetscFunctionReturn(0);
}
391329b38603SBarry Smith 
/*
  Releases the GPU-side COO assembly scratch data (the permutation vectors and,
  for the extended COO path, the jmap/perm device arrays) so that the matrix can
  be re-preallocated. A no-op when no CUSPARSE context exists.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm   = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    PetscCallCUDA(cudaFree(cusp->perm_d));
    /* NULL the freed pointers: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d
       again whenever they are non-NULL, which would otherwise be a double free */
    cusp->jmap_d = NULL;
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3931cbc6b225SStefano Zampini 
/* Frees the whole Mat_SeqAIJCUSPARSE context: both SpMV structures (matrix and
   cached transpose), all thrust scratch vectors, the COO device arrays, and the
   cusparse handle; finally frees the struct itself and NULLs the caller's pointer
   (via PetscFree). Safe to call when *cusparsestruct is already NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscFunctionBegin;
  if (*cusparsestruct) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat, (*cusparsestruct)->format));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose, (*cusparsestruct)->format));
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
    if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
    if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
    PetscCall(PetscFree(*cusparsestruct));
  }
  PetscFunctionReturn(0);
}
39507f756511SDominic Meiser 
/* Frees a CsrMatrix (its three thrust device vectors and the struct itself) and
   NULLs the caller's pointer. A no-op when *mat is already NULL. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;

    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
39637f756511SDominic Meiser 
/* Overload for triangular-factor structures: frees the cusparse descriptors, the
   csrsv solve info, the CSR storage, and all device/host scratch buffers, then
   frees the struct (PetscFree NULLs the caller's pointer). Safe when *trifactor
   is already NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
    if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparseDestroyCsrsvInfo((*trifactor)->solveInfo));
    PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
    if ((*trifactor)->solveBuffer) PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
    if ((*trifactor)->AA_h) PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h)); /* pinned host buffer */
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
#endif
    PetscCall(PetscFree(*trifactor));
  }
  PetscFunctionReturn(0);
}
39807f756511SDominic Meiser 
/* Destroy a mat-mult structure for the given storage format.

   For ELL/HYB formats (pre CUDA-11 only) the opaque mat pointer is a cusparseHybMat_t;
   otherwise it is a CsrMatrix. Also frees the matrix descriptor, the compressed-row index
   array, the device-resident scalar constants (alpha_one/beta_zero/beta_one), and — for
   CUDA >= 11 — the generic SpMat descriptor and any initialized SpMV buffers/vector
   descriptors. Sets *matstruct to NULL on completion; safe no-op when already NULL. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct, MatCUSPARSEStorageFormat format)
{
  CsrMatrix *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format == MAT_CUSPARSE_ELL || format == MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
        SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
#endif
      } else {
        mat = (CsrMatrix *)(*matstruct)->mat;
        CsrMatrix_Destroy(&mat);
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one));

#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* cuSpMV[] holds one set of SpMV resources per cusparseOperation_t used (3 slots) */
    for (int i = 0; i < 3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
#endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
40227f756511SDominic Meiser 
/* Release all factorization-related resources held by a TriFactors container, but keep the
   container itself (and its cusparse handle) alive so it can be reused for a refactorization.

   Destroys the four triangular-factor structs (lower/upper, plain and transposed), the
   row/column permutation index arrays, the work vector, and the banded-solver device arrays.
   For cuSPARSE >= 11.5 it additionally frees the resources of the SpSV-based ILU0/IC0 path:
   device CSR arrays, X/Y work vectors, factorization/solve buffers, and all matrix, vector,
   SpSV, and incomplete-factorization descriptors. Safe no-op when *trifactors is NULL. */
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d) PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->init_dev_prop = PETSC_FALSE;
#if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* Not needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
#endif
  }
  PetscFunctionReturn(0);
}
4071ccdfe979SStefano Zampini 
/* Fully destroy a TriFactors container: reset all held resources, destroy the cusparse
   handle (if any), and free the container. Safe no-op when *trifactors is NULL.
   NOTE(review): the handle is read after Reset(); Reset() does not touch the handle field. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    cusparseHandle_t handle;

    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    handle = (*trifactors)->handle;
    if (handle) PetscCallCUSPARSE(cusparseDestroy(handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(0);
}
40847e8381f9SStefano Zampini 
/* Strict-weak lexicographic ordering of (i,j) index pairs: compare rows first, break ties on columns */
struct IJCompare {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() < t2.get<0>() || (t1.get<0>() == t2.get<0>() && t1.get<1>() < t2.get<1>());
  }
};
40937e8381f9SStefano Zampini 
/* Equality of (i,j) index pairs: true iff both row and column indices match */
struct IJEqual {
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
41017e8381f9SStefano Zampini 
/* Binary op for adjacent_difference: 1 where consecutive values differ, 0 where equal */
struct IJDiff {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return t1 != t2 ? 1 : 0; }
};
41057e8381f9SStefano Zampini 
/* Binary op combining two 0/1 change flags: logical OR, result normalized to 0 or 1 */
struct IJSum {
  __host__ __device__ inline PetscInt operator()(const PetscInt &t1, const PetscInt &t2) { return (t1 || t2) ? 1 : 0; }
};
41097e8381f9SStefano Zampini 
41107e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
4111219fbbafSJunchao Zhang /* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/* Insert/add COO values v[] into the device CSR matrix, using the permutation (cooPerm) and
   repeat-map (cooPerm_a) built by MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic().

   v may live on host (it is staged to the device) or on device (used in place); v == NULL
   with INSERT_VALUES zeroes the matrix. Finishes by marking the matrix GPU-resident and
   assembled (a shortened MatAssemblyEnd_SeqAIJ). */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                   *cusp      = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ                           *a         = (Mat_SeqAIJ *)A->data;
  THRUSTARRAY                          *cooPerm_v = NULL; /* device staging copy of v[], allocated only for host input */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                            *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE struct");
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO preallocation was done; just (re)assemble */
    PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix *)cusp->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: copy the n COO values to the device first */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v, v + n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {   /* there are repeated entries in d_v[], and we need to add them up first */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto         vbit      = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), cooPerm_w->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(), cooPerm_w->end(), matrix->values->begin(), matrix->values->begin(), thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), vbit, thrust::make_discard_iterator(), matrix->values->begin(), thrust::equal_to<PetscInt>(), thrust::plus<PetscScalar>());
    } else {
      /* unique entries: values[i] = d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->begin()), matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->cooPerm->end()), matrix->values->end()));
      thrust::for_each(zibit, zieit, VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", A->rmap->n, A->cmap->n, a->nz));
  PetscCall(PetscInfo(A, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", a->rmax));
  a->reallocs = 0;
  A->info.mallocs += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
41887e8381f9SStefano Zampini 
/* Mark the cached transpose of A as stale; when destroy is true, also free the cached
   transpose mult structure and the csr2csc index mapping used to rebuild it cheaply.
   No-op when A has no CUSPARSE backing struct. */
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(0);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4204a49f1ed0SStefano Zampini 
42057e8381f9SStefano Zampini #include <thrust/binary_search.h>
4206219fbbafSJunchao Zhang /* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/* Build the CSR structure of A from n COO (coo_i[], coo_j[]) pairs on the device, and record
   the permutation (cooPerm) and duplicate map (cooPerm_a) later used by
   MatSetValuesCOO_SeqAIJCUSPARSE_Basic() to scatter/reduce values.

   coo_i/coo_j may be host or device arrays (host input is staged to the device). Indices must
   be non-negative ('Basic' path). On return the host CSR arrays (a->i, a->j) are populated
   with zeroed values, the matrix is copied to the GPU, and the cached transpose is invalidated. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  PetscInt            cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* COO size changed: old permutation arrays are unusable */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    thrust::device_ptr<PetscInt> d_i, d_j;
    PetscInt                    *d_raw_i, *d_raw_j;
    PetscBool                    free_raw_i = PETSC_FALSE, free_raw_j = PETSC_FALSE;
    PetscMemType                 imtype, jmtype;

    /* stage coo_i[] to the device if it is a host array */
    PetscCall(PetscGetMemType(coo_i, &imtype));
    if (PetscMemTypeHost(imtype)) {
      PetscCallCUDA(cudaMalloc(&d_raw_i, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_i, coo_i, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_i        = thrust::device_pointer_cast(d_raw_i);
      free_raw_i = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_i = thrust::device_pointer_cast(coo_i);
    }

    PetscCall(PetscGetMemType(coo_j, &jmtype));
    if (PetscMemTypeHost(jmtype)) { // MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic() passes device coo_i[] and host coo_j[]!
      PetscCallCUDA(cudaMalloc(&d_raw_j, sizeof(PetscInt) * n));
      PetscCallCUDA(cudaMemcpy(d_raw_j, coo_j, sizeof(PetscInt) * n, cudaMemcpyHostToDevice));
      d_j        = thrust::device_pointer_cast(d_raw_j);
      free_raw_j = PETSC_TRUE;
      PetscCall(PetscLogCpuToGpu(1. * n * sizeof(PetscInt)));
    } else {
      d_j = thrust::device_pointer_cast(coo_j);
    }

    THRUSTINTARRAY ii(A->rmap->n); /* per-row upper bounds, becomes a->i[1..n] */

    if (!cusp->cooPerm) cusp->cooPerm = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i, d_j));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i + n, d_j + n));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    (*cusp->cooPerm_a).assign(d_i, d_i + n);                              /* copy the sorted array */
    THRUSTINTARRAY w(d_j, d_j + n);

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(), w.end(), w.begin(), IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0;                                                                                 /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0]                  = 0;
      thrust::transform(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), w.begin(), cusp->cooPerm_a->begin(), IJSum());            /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(), cusp->cooPerm_a->end(), cusp->cooPerm_a->begin(), thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i, nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin());                              /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays of the Mat_SeqAIJ from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A, &a->a, &a->j, &a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n + 1, &a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i + 1, ii.data().get(), A->rmap->n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax          = 0;
    PetscCall(PetscMalloc1(a->nz, &a->a));
    PetscCall(PetscMalloc1(a->nz, &a->j));
    PetscCallCUDA(cudaMemcpy(a->j, thrust::raw_pointer_cast(d_j), a->nz * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n, &a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n, &a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i + 1] - a->i[i];
      nzr += (PetscInt) !!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax                 = PetscMax(a->rmax, nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated  = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n + a->nz) * sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    if (free_raw_i) PetscCallCUDA(cudaFree(d_raw_i));
    if (free_raw_j) PetscCallCUDA(cudaFree(d_raw_j));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A, 0, NULL));
  }
  PetscCall(MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a, a->nz));
  PetscCall(MatCheckCompressedRow(A, nzr, &a->compressedrow, a->i, A->rmap->n, 0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  PetscFunctionReturn(0);
}
4341ed502f03SStefano Zampini 
/* COO preallocation entry point: dispatch between the fast 'basic' path and the extended path.

   The basic path is used when the indices are device-resident or when the host arrays contain
   no negative indices (negative indices mean "ignore this entry" and need the extended CPU
   preallocation). The extended path builds jmap/perm on the CPU via
   MatSetPreallocationCOO_SeqAIJ() and uploads them to the device for MatSetValuesCOO. */
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      /* scan for negative (ignored) indices; any one forces the extended path */
      for (PetscCount k = 0; k < coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ *>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE *>(mat->spptr);
    /* upload the CPU-built jmap (nonzero -> range of COO entries) and perm (COO entry order) */
    PetscCallCUDA(cudaMalloc((void **)&dev->jmap_d, (seq->nz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d, seq->jmap, (seq->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void **)&dev->perm_d, seq->Atot * sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(dev->perm_d, seq->perm, seq->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4380219fbbafSJunchao Zhang 
/* Kernel: for each matrix nonzero i < nnz, sum the COO input values mapped to it
   (kv[perm[k]] for k in [jmap[i], jmap[i+1])) and combine with a[i]: overwrite for
   INSERT_VALUES, accumulate otherwise. 1-D launch; the grid-stride loop keeps it
   correct for any grid size. */
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += stride) {
    PetscScalar v = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) v += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES) ? v : a[i] + v;
  }
}
4391219fbbafSJunchao Zhang 
/* Set/add COO values into A. On the extended-COO path, host-resident v[] is staged to the
   device, then one MatAddCOOValues kernel thread per nonzero gathers its duplicates via the
   device jmap/perm arrays. Otherwise delegate to the 'basic' thrust implementation. */
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar  *v1 = v;
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void **)&v1, seq->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, seq->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    /* INSERT_VALUES overwrites everything, so a write-only array access suffices */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

    if (Annz) {
      MatAddCOOValues<<<(Annz + 255) / 256, 256>>>(v1, Annz, dev->jmap_d, dev->perm_d, imode, Aa);
      PetscCallCUDA(cudaPeekAtLastError()); /* catch launch-configuration errors without clearing the error state */
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A, v, imode));
  }
  PetscFunctionReturn(0);
}
4426219fbbafSJunchao Zhang 
44275b7e41feSStefano Zampini /*@C
442811a5261eSBarry Smith     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for `MATSEQAIJCUSPARSE` matrices.
44295b7e41feSStefano Zampini 
44305b7e41feSStefano Zampini    Not collective
44315b7e41feSStefano Zampini 
44325b7e41feSStefano Zampini     Input Parameters:
44335b7e41feSStefano Zampini +   A - the matrix
443411a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
44355b7e41feSStefano Zampini 
44365b7e41feSStefano Zampini     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
44395b7e41feSStefano Zampini 
44405b7e41feSStefano Zampini     Level: developer
44415b7e41feSStefano Zampini 
444211a5261eSBarry Smith     Note:
44435b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
44445b7e41feSStefano Zampini 
4445db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
44465b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* both output pointers must be requested; bail out silently otherwise */
  if (!i || !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* make sure the device CSR data is current */
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* lazily build and cache the uncompressed row offsets on the device */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
44745f101d05SStefano Zampini 
44755b7e41feSStefano Zampini /*@C
447611a5261eSBarry Smith     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with `MatSeqAIJCUSPARSEGetIJ()`
44775b7e41feSStefano Zampini 
44785b7e41feSStefano Zampini    Not collective
44795b7e41feSStefano Zampini 
44805b7e41feSStefano Zampini     Input Parameters:
44815b7e41feSStefano Zampini +   A - the matrix
448211a5261eSBarry Smith -   compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
44835b7e41feSStefano Zampini 
44845b7e41feSStefano Zampini     Output Parameters:
+   i - the CSR row pointers
-   j - the CSR column indices
44875b7e41feSStefano Zampini 
44885b7e41feSStefano Zampini     Level: developer
44895b7e41feSStefano Zampini 
4490db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetIJ()`
44915b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the pointers were borrowed from device-side CSR storage owned by A; just clear them */
  if (i) *i = NULL;
  if (j) *j = NULL;
  PetscFunctionReturn(0);
}
45015f101d05SStefano Zampini 
45025b7e41feSStefano Zampini /*@C
450311a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45045b7e41feSStefano Zampini 
45055b7e41feSStefano Zampini    Not Collective
45065b7e41feSStefano Zampini 
45075b7e41feSStefano Zampini    Input Parameter:
450811a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45095b7e41feSStefano Zampini 
45105b7e41feSStefano Zampini    Output Parameter:
45115b7e41feSStefano Zampini .   a - pointer to the device data
45125b7e41feSStefano Zampini 
45135b7e41feSStefano Zampini    Level: developer
45145b7e41feSStefano Zampini 
451511a5261eSBarry Smith    Note:
451611a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
45175b7e41feSStefano Zampini 
4518db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
45195b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read access may trigger a host-to-device copy to bring the values up to date */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  CsrMatrix *matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
4537ed502f03SStefano Zampini 
45385b7e41feSStefano Zampini /*@C
453911a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`
45405b7e41feSStefano Zampini 
45415b7e41feSStefano Zampini    Not Collective
45425b7e41feSStefano Zampini 
45435b7e41feSStefano Zampini    Input Parameter:
454411a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45455b7e41feSStefano Zampini 
45465b7e41feSStefano Zampini    Output Parameter:
45475b7e41feSStefano Zampini .   a - pointer to the device data
45485b7e41feSStefano Zampini 
45495b7e41feSStefano Zampini    Level: developer
45505b7e41feSStefano Zampini 
4551db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
45525b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: no object-state bump or diagonal invalidation needed */
  *a = NULL;
  PetscFunctionReturn(0);
}
4562ed502f03SStefano Zampini 
45635b7e41feSStefano Zampini /*@C
456411a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
45655b7e41feSStefano Zampini 
45665b7e41feSStefano Zampini    Not Collective
45675b7e41feSStefano Zampini 
45685b7e41feSStefano Zampini    Input Parameter:
456911a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
45705b7e41feSStefano Zampini 
45715b7e41feSStefano Zampini    Output Parameter:
45725b7e41feSStefano Zampini .   a - pointer to the device data
45735b7e41feSStefano Zampini 
45745b7e41feSStefano Zampini    Level: developer
45755b7e41feSStefano Zampini 
457611a5261eSBarry Smith    Note:
457711a5261eSBarry Smith    May trigger host-device copies if up-to-date matrix data is on host
45785b7e41feSStefano Zampini 
4579db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
45805b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* read-write access may trigger a host-to-device copy to bring the values up to date */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  CsrMatrix *matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* the caller may modify the values: data now lives on the GPU and any cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
46005b7e41feSStefano Zampini /*@C
460111a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`
4602039c6fbaSStefano Zampini 
46035b7e41feSStefano Zampini    Not Collective
46045b7e41feSStefano Zampini 
46055b7e41feSStefano Zampini    Input Parameter:
460611a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
46075b7e41feSStefano Zampini 
46085b7e41feSStefano Zampini    Output Parameter:
46095b7e41feSStefano Zampini .   a - pointer to the device data
46105b7e41feSStefano Zampini 
46115b7e41feSStefano Zampini    Level: developer
46125b7e41feSStefano Zampini 
4613db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`
46145b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values may have been modified: cached diagonal is stale and object state must advance */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4626039c6fbaSStefano Zampini 
46275b7e41feSStefano Zampini /*@C
462811a5261eSBarry Smith    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored
46295b7e41feSStefano Zampini 
46305b7e41feSStefano Zampini    Not Collective
46315b7e41feSStefano Zampini 
46325b7e41feSStefano Zampini    Input Parameter:
463311a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
46345b7e41feSStefano Zampini 
46355b7e41feSStefano Zampini    Output Parameter:
46365b7e41feSStefano Zampini .   a - pointer to the device data
46375b7e41feSStefano Zampini 
46385b7e41feSStefano Zampini    Level: developer
46395b7e41feSStefano Zampini 
464011a5261eSBarry Smith    Note:
464111a5261eSBarry Smith    Does not trigger host-device copies and flags data validity on the GPU
46425b7e41feSStefano Zampini 
4643db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
46445b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* write-only access: deliberately no MatSeqAIJCUSPARSECopyToGPU() -- the caller overwrites everything */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  CsrMatrix *matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* fresh data will live on the GPU; any cached transpose is stale */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(0);
}
4663ed502f03SStefano Zampini 
46645b7e41feSStefano Zampini /*@C
466511a5261eSBarry Smith    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`
46665b7e41feSStefano Zampini 
46675b7e41feSStefano Zampini    Not Collective
46685b7e41feSStefano Zampini 
46695b7e41feSStefano Zampini    Input Parameter:
467011a5261eSBarry Smith .   A - a `MATSEQAIJCUSPARSE` matrix
46715b7e41feSStefano Zampini 
46725b7e41feSStefano Zampini    Output Parameter:
46735b7e41feSStefano Zampini .   a - pointer to the device data
46745b7e41feSStefano Zampini 
46755b7e41feSStefano Zampini    Level: developer
46765b7e41feSStefano Zampini 
4677db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
46785b7e41feSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* values were (re)written: cached diagonal is stale and object state must advance */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(0);
}
4690ed502f03SStefano Zampini 
/* Orders (row, col, value, perm) tuples lexicographically by (row, col); value and perm are ignored. */
struct IJCompare4 {
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
4699ed502f03SStefano Zampini 
/* Functor adding a fixed offset to an index; used with thrust transform iterators
   to relocate column indices / row offsets when concatenating matrices. */
struct Shift {
  int _shift; /* constant offset applied to every input */

  Shift(int shift) : _shift(shift) { }
  __host__ __device__ inline int operator()(const int &c) { return c + _shift; }
};
4706ed502f03SStefano Zampini 
4707ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4708*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
4709*d71ae5a4SJacob Faibussowitsch {
4710ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c;
4711ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp;
4712ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4713ed502f03SStefano Zampini   CsrMatrix                    *Acsr, *Bcsr, *Ccsr;
4714ed502f03SStefano Zampini   PetscInt                      Annz, Bnnz;
4715ed502f03SStefano Zampini   cusparseStatus_t              stat;
4716ed502f03SStefano Zampini   PetscInt                      i, m, n, zero = 0;
4717ed502f03SStefano Zampini 
4718ed502f03SStefano Zampini   PetscFunctionBegin;
4719ed502f03SStefano Zampini   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
4720ed502f03SStefano Zampini   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
4721ed502f03SStefano Zampini   PetscValidPointer(C, 4);
4722ed502f03SStefano Zampini   PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
4723ed502f03SStefano Zampini   PetscCheckTypeName(B, MATSEQAIJCUSPARSE);
47245f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
472508401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
4726aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4727aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
4728ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4729ed502f03SStefano Zampini     m = A->rmap->n;
4730ed502f03SStefano Zampini     n = A->cmap->n + B->cmap->n;
47319566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, C));
47329566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C, m, n, m, n));
47339566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE));
4734ed502f03SStefano Zampini     c                       = (Mat_SeqAIJ *)(*C)->data;
4735ed502f03SStefano Zampini     Ccusp                   = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
4736ed502f03SStefano Zampini     Cmat                    = new Mat_SeqAIJCUSPARSEMultStruct;
4737ed502f03SStefano Zampini     Ccsr                    = new CsrMatrix;
4738ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4739ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4740ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4741ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4742ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4743ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4744ed502f03SStefano Zampini     Ccusp->nrows            = m;
4745ed502f03SStefano Zampini     Ccusp->mat              = Cmat;
4746ed502f03SStefano Zampini     Ccusp->mat->mat         = Ccsr;
4747ed502f03SStefano Zampini     Ccsr->num_rows          = m;
4748ed502f03SStefano Zampini     Ccsr->num_cols          = n;
47499566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
47509566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
47519566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
47529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one), sizeof(PetscScalar)));
47539566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero), sizeof(PetscScalar)));
47549566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
47559566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
47589566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
47599566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
476028b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
476128b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4762ed502f03SStefano Zampini 
4763ed502f03SStefano Zampini     Acsr                 = (CsrMatrix *)Acusp->mat->mat;
4764ed502f03SStefano Zampini     Bcsr                 = (CsrMatrix *)Bcusp->mat->mat;
4765ed502f03SStefano Zampini     Annz                 = (PetscInt)Acsr->column_indices->size();
4766ed502f03SStefano Zampini     Bnnz                 = (PetscInt)Bcsr->column_indices->size();
4767ed502f03SStefano Zampini     c->nz                = Annz + Bnnz;
4768ed502f03SStefano Zampini     Ccsr->row_offsets    = new THRUSTINTARRAY32(m + 1);
4769ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4770ed502f03SStefano Zampini     Ccsr->values         = new THRUSTARRAY(c->nz);
4771ed502f03SStefano Zampini     Ccsr->num_entries    = c->nz;
4772ed502f03SStefano Zampini     Ccusp->cooPerm       = new THRUSTINTARRAY(c->nz);
4773ed502f03SStefano Zampini     if (c->nz) {
47742ed87e7eSStefano Zampini       auto              Acoo = new THRUSTINTARRAY32(Annz);
47752ed87e7eSStefano Zampini       auto              Bcoo = new THRUSTINTARRAY32(Bnnz);
47762ed87e7eSStefano Zampini       auto              Ccoo = new THRUSTINTARRAY32(c->nz);
47772ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff, *Broff;
47782ed87e7eSStefano Zampini 
4779ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4780ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4781ed502f03SStefano Zampini           Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
4782ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
47839566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
4784ed502f03SStefano Zampini         }
47852ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
47862ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4787ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4788ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4789ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1);
4790ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1);
47919566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt)));
4792ed502f03SStefano Zampini         }
47932ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
47942ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
47959566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
47969371c9d4SSatish Balay       stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47979371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
47989371c9d4SSatish Balay       stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO);
47999371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48002ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
48012ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
48022ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
48038909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0)
4804ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n));
4805ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n));
48068909a122SStefano Zampini #else
48078909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
48088909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
48098909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
48108909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n));
48118909a122SStefano Zampini #endif
48122ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz);
48132ed87e7eSStefano Zampini       auto Azb   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm));
48142ed87e7eSStefano Zampini       auto Aze   = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm));
48152ed87e7eSStefano Zampini       auto Bzb   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm));
48162ed87e7eSStefano Zampini       auto Bze   = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm));
48172ed87e7eSStefano Zampini       auto Czb   = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin()));
4818ed502f03SStefano Zampini       auto p1    = Ccusp->cooPerm->begin();
4819ed502f03SStefano Zampini       auto p2    = Ccusp->cooPerm->begin();
4820ed502f03SStefano Zampini       thrust::advance(p2, Annz);
4821792fecdfSBarry Smith       PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4()));
48228909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0)
48238909a122SStefano Zampini       thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n));
48248909a122SStefano Zampini #endif
48252ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
48262ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
48272ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
4828792fecdfSBarry Smith       PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
48292ed87e7eSStefano Zampini #else
48302ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
4831792fecdfSBarry Smith       PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred));
4832792fecdfSBarry Smith       PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred));
48332ed87e7eSStefano Zampini #endif
48349371c9d4SSatish Balay       stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO);
48359371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
48369566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
48372ed87e7eSStefano Zampini       delete wPerm;
48382ed87e7eSStefano Zampini       delete Acoo;
48392ed87e7eSStefano Zampini       delete Bcoo;
48402ed87e7eSStefano Zampini       delete Ccoo;
4841ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48429371c9d4SSatish Balay       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48439371c9d4SSatish Balay       PetscCallCUSPARSE(stat);
4844ed502f03SStefano Zampini #endif
48451a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
48469566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
48479566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4848ed502f03SStefano Zampini         PetscBool                     AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4849ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4850ed502f03SStefano Zampini         CsrMatrix                    *CcsrT = new CsrMatrix;
4851ed502f03SStefano Zampini         CsrMatrix                    *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4852ed502f03SStefano Zampini         CsrMatrix                    *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4853ed502f03SStefano Zampini 
48541a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
48551a2c6b5cSJunchao Zhang         (*C)->transupdated            = PETSC_TRUE;
4856a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu         = NULL;
4857ed502f03SStefano Zampini         CmatT->cprowIndices           = NULL;
4858ed502f03SStefano Zampini         CmatT->mat                    = CcsrT;
4859ed502f03SStefano Zampini         CcsrT->num_rows               = n;
4860ed502f03SStefano Zampini         CcsrT->num_cols               = m;
4861ed502f03SStefano Zampini         CcsrT->num_entries            = c->nz;
4862ed502f03SStefano Zampini 
4863ed502f03SStefano Zampini         CcsrT->row_offsets    = new THRUSTINTARRAY32(n + 1);
4864ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4865ed502f03SStefano Zampini         CcsrT->values         = new THRUSTARRAY(c->nz);
4866ed502f03SStefano Zampini 
48679566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4868ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4869ed502f03SStefano Zampini         if (AT) {
4870ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT);
4871ed502f03SStefano Zampini           thrust::advance(rT, -1);
4872ed502f03SStefano Zampini         }
4873ed502f03SStefano Zampini         if (BT) {
4874ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz));
4875ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz));
4876ed502f03SStefano Zampini           thrust::copy(titb, tite, rT);
4877ed502f03SStefano Zampini         }
4878ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4879ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT);
4880ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT);
4881ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4882ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4883ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
48849566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4885ed502f03SStefano Zampini 
48869566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
48879566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
48889566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
48899566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one), sizeof(PetscScalar)));
48909566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero), sizeof(PetscScalar)));
48919566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
48929566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48939566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
48949566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
4895ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
48969371c9d4SSatish Balay         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);
48979371c9d4SSatish Balay         PetscCallCUSPARSE(stat);
4898ed502f03SStefano Zampini #endif
4899ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4900ed502f03SStefano Zampini       }
4901ed502f03SStefano Zampini     }
4902ed502f03SStefano Zampini 
4903ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4904ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4905ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
49069566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m + 1, &c->i));
49079566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->j));
4908ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4909ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4910ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4911ed502f03SStefano Zampini       ii = *Ccsr->row_offsets;
4912ed502f03SStefano Zampini       jj = *Ccsr->column_indices;
49139566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49149566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4915ed502f03SStefano Zampini     } else {
49169566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
49179566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost));
4918ed502f03SStefano Zampini     }
49199566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt)));
49209566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->ilen));
49219566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m, &c->imax));
4922ed502f03SStefano Zampini     c->maxnz         = c->nz;
4923ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4924ed502f03SStefano Zampini     c->rmax          = 0;
4925ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4926ed502f03SStefano Zampini       const PetscInt nn = c->i[i + 1] - c->i[i];
4927ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4928ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt) !!nn;
4929ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax, nn);
4930ed502f03SStefano Zampini     }
49319566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
49329566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz, &c->a));
4933ed502f03SStefano Zampini     (*C)->nonzerostate++;
49349566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
49359566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4936ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4937ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4938ed502f03SStefano Zampini   } else {
493908401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n);
4940ed502f03SStefano Zampini     c = (Mat_SeqAIJ *)(*C)->data;
4941ed502f03SStefano Zampini     if (c->nz) {
4942ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr;
49435f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cooPerm");
4944aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
494508401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate");
49469566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
49479566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
49485f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
49495f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
4950ed502f03SStefano Zampini       Acsr = (CsrMatrix *)Acusp->mat->mat;
4951ed502f03SStefano Zampini       Bcsr = (CsrMatrix *)Bcusp->mat->mat;
4952ed502f03SStefano Zampini       Ccsr = (CsrMatrix *)Ccusp->mat->mat;
4953aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size());
4954aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size());
4955aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size());
4956aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries);
49575f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->cooPerm->size(), (PetscInt)Ccsr->values->size());
4958ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4959ed502f03SStefano Zampini       thrust::advance(pmid, Acsr->num_entries);
49609566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
49619371c9d4SSatish Balay       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->begin())));
49629371c9d4SSatish Balay       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
4963ed502f03SStefano Zampini       thrust::for_each(zibait, zieait, VecCUDAEquals());
49649371c9d4SSatish Balay       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid)));
49659371c9d4SSatish Balay       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->cooPerm->end())));
4966ed502f03SStefano Zampini       thrust::for_each(zibbit, ziebit, VecCUDAEquals());
49679566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE));
49681a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
49695f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4970ed502f03SStefano Zampini         PetscBool  AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4971ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL;
4972ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL;
4973ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat;
4974ed502f03SStefano Zampini         auto       vT    = CcsrT->values->begin();
4975ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT);
4976ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT);
49771a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4978ed502f03SStefano Zampini       }
49799566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4980ed502f03SStefano Zampini     }
4981ed502f03SStefano Zampini   }
49829566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4983ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4984ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4985ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4986ed502f03SStefano Zampini   PetscFunctionReturn(0);
4987ed502f03SStefano Zampini }
4988c215019aSStefano Zampini 
/* Gather selected entries of A's value array into v.

   A   - the SEQAIJCUSPARSE matrix whose (GPU-resident) value array is read
   n   - number of entries to copy
   idx - indices into A's value array to gather, or NULL to copy the first n values contiguously
   v   - output buffer; may be host or device memory (detected with isCudaMem())

   The gather itself always runs on the GPU; if v is host memory the result is
   staged through a temporary device buffer and copied back with cudaMemcpy. */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem; /* true if v points to device memory */
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    THRUSTARRAY                    *w = NULL; /* staging buffer, only needed for a host destination */
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w  = new THRUSTARRAY(n);
      dv = w->data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[idx[i]] via a zipped permutation gather executed on the GPU */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (w) PetscCallCUDA(cudaMemcpy(v, w->data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    delete w;
  } else {
    /* no index set: copy the first n values straight from the device array */
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination means the values moved device->host above, so log GPU-to-CPU
     traffic (previously mislogged as PetscLogCpuToGpu) */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(0);
}
5024