xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision bc996fdc2f4503fc01f6a6476f80e02484e6569c)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Name tables handed to PetscOptionsEnum(). Each table lists the selectable value names first,
   then the enum type name, then the prefix shared by the enum constants, and ends with a 0 entry. */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The enums below are reproduced from cusparse.h of CUDA-11.0. PetscOptionsEnum() is used to parse the user's
     command line choice, so the names in MatCUSPARSESpMVAlgorithms[] etc. must appear in 0-based integer value order.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT",
    "COOMV_ALG",
    "CSRMV_ALG1",
    "CSRMV_ALG2",
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    0
  };
  /* Note the value order 0..6: CSR_ALG1 (4) comes before COO_ALG4 (5) */
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT",
    "COO_ALG1",
    "COO_ALG2",
    "COO_ALG3",
    "CSR_ALG1",
    "COO_ALG4",
    "CSR_ALG2",
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    0
  };
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID", /* cusparse does not have enum 0! We created one */
    "ALG1",
    "ALG2",
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    0
  };
#endif
589ae82921SPaul Mullowney 
/* ---- Forward declarations of file-local routines ---- */

/* Cholesky / ICC factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* LU / ILU factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Solves with the factors */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);

/* Options, scaling/AXPY and matrix-vector products */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Destruction of the GPU-side data structures; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded on its first argument */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host/device synchronization helpers */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* Sub-array copy and COO assembly */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94c215019aSStefano Zampini 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Records the requested GPU storage format in the
  Mat_SeqAIJCUSPARSE structure attached to A.

  Both MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL write the same field; any other operation
  raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    cusp->format = format;
  } else {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
1129ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Chooses the GPU storage format a CUSPARSE matrix uses for a given
   operation. For MPIAIJCUSPARSE matrices, only the MatMult operation can use different GPU
   storage formats.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* forward to the method looked up on A under the name "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
137e057df02SPaul Mullowney 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Stores the use_cpu flag in the use_cpu_solve field of
  the Mat_SeqAIJCUSPARSE structure attached to A.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
146365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Selects whether MatSolve is performed on the CPU.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this routine to specify whether the solve is done on the CPU or the GPU (GPU is the default).

   Level: intermediate

.seealso: `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* forward to the method looked up on A under the name "MatCUSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
172365b711fSMark Adams 
/*
  MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE locally and passes every other
  option on to MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option is being switched off, invalidate any existing transpose so that
       re-enabling the option later cannot cause logic errors */
    if (A->form_explicit_transpose && !flg) {
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    }
    A->form_explicit_transpose = flg;
  } else {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
  }
  PetscFunctionReturn(0);
}
188e6e9a74fSStefano Zampini 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization of A into B.

  A is first passed to MatSeqAIJCUSPARSECopyFromGPU(), then the factorization itself is done by
  MatLUFactorNumeric_SeqAIJ() and B is marked PETSC_OFFLOAD_CPU. Unless use_cpu_solve is set on B,
  the CUSPARSE solve routines are installed (the NaturalOrdering variants when both the row and
  column orderings are identities) and MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() is called on B.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *fact = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  IS                 rowperm = fact->row,colperm = fact->col;
  PetscBool          rows_natural,cols_natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the orderings decide which version of MatSolve is needed */
  PetscCall(ISIdentity(rowperm,&rows_natural));
  PetscCall(ISIdentity(colperm,&cols_natural));

  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  if (!cusp->use_cpu_solve) {
    const PetscBool natural = (rows_natural && cols_natural) ? PETSC_TRUE : PETSC_FALSE;

    B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;

    /* get the triangular factors */
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}
224bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Reads the -mat_cusparse_* options for A.

  Options are only queried when A is not a factored matrix (A->factortype == MAT_FACTOR_NONE).
  The storage format and CPU-solve options are applied through the public setters; with CUDA 11
  or newer the SpMV, SpMM and CSR-to-CSC algorithm choices are stored directly in the
  Mat_SeqAIJCUSPARSE structure.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat chosen;
  PetscBool                given;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV","MatCUSPARSESetFormat",
                               MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&chosen,&given));
    if (given) {
      PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,chosen));
    }

    /* storage format for SpMV and TriSolve */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve","MatCUSPARSESetFormat",
                               MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&chosen,&given));
    if (given) {
      PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,chosen));
    }

    /* CPU versus GPU triangular solve */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusp->use_cpu_solve,&cusp->use_cpu_solve,&given));
    if (given) {
      PetscCall(MatCUSPARSESetUseCPUSolve(A,cusp->use_cpu_solve));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* PetscOptionsEnum() assigns enum values by position in the name table, so after each option that the
       user actually supplied, verify that the table still agrees with the cuSPARSE enum */
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)","cusparseSpMVAlg_t",
                               MatCUSPARSESpMVAlgorithms,(PetscEnum)cusp->spmvAlg,(PetscEnum*)&cusp->spmvAlg,&given));
  #if CUSPARSE_VERSION > 11301
    PetscCheck(!given || CUSPARSE_SPMV_CSR_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #else
    PetscCheck(!given || CUSPARSE_CSRMV_ALG1 == 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
  #endif

    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)","cusparseSpMMAlg_t",
                               MatCUSPARSESpMMAlgorithms,(PetscEnum)cusp->spmmAlg,(PetscEnum*)&cusp->spmmAlg,&given));
    PetscCheck(!given || CUSPARSE_SPMM_CSR_ALG1 == 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices","cusparseCsr2CscAlg_t",
                               MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusp->csr2cscAlg,(PetscEnum*)&cusp->csr2cscAlg,&given));
    PetscCheck(!given || CUSPARSE_CSR2CSC_ALG1 == 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
2649ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds the lower triangular factor of the factored matrix A
  as a CSR matrix with an explicit unit diagonal and registers it for the cuSPARSE triangular solve.

  Nothing is done when A has no rows, or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED
  nor PETSC_OFFLOAD_CPU.

  First call (no loTriFactorPtr yet): row offsets, column indices and values are assembled in pinned
  host buffers, copied into the thrust arrays of a new CsrMatrix, the cuSPARSE matrix descriptor is
  created (lower fill mode, unit diagonal) and the csrsv analysis is run. The pinned value buffer is
  kept in loTriFactor->AA_h for later updates; the two index buffers are freed.

  Later calls: only the values are refreshed, reusing (or allocating) loTriFactor->AA_h; the sparsity
  pattern stored in the CsrMatrix is left untouched.

  Input Parameter:
. A - the factored matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset, rowOffset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));

        /* Allocate Space for the lower triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));

        /* Fill the lower triangular matrix; row 0 consists of the unit diagonal entry only */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        offset   = 1;
        rowOffset= 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* additional 1 for the term on the diagonal */
          AiLo[i]    = rowOffset;
          rowOffset += nz+1;

          PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
          PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));

          /* append the unit diagonal entry after the strictly lower entries of row i */
          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&loTriFactor));
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                               &loTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis. The whole call is selected per CUDA version: preprocessor
           directives must not appear inside the argument list of a function-like macro. */
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         loTriFactor->solveInfo,
                                         loTriFactor->solvePolicy, loTriFactor->solveBuffer));
      #else
        PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                         loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                         loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                         loTriFactor->csrMat->column_indices->data().get(),
                                         loTriFactor->solveInfo));
      #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for the values-only update path; the index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        PetscCallCUDA(cudaFreeHost(AiLo));
        PetscCallCUDA(cudaFreeHost(AjLo));
        PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
        }
        /* Fill the lower triangular matrix, using the same layout as in the first-call branch */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
4089ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Creates, or refreshes the values of, the upper
  triangular ILU factor stored in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr

  Input Parameter:
. A - the factored matrix; its host arrays a->a, a->j and a->diag are read

  Notes:
  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  Rows are walked from the last to the first. In each packed row the diagonal comes first and
  is stored as the reciprocal of a->a[a->diag[row]]; the off-diagonal entries follow unchanged.

  When no upper factor exists yet, the CSR pattern and values are assembled in page-locked host
  buffers, copied into thrust arrays, and the cuSPARSE triangular solve analysis is run. When a
  factor already exists only the values are recomputed (into the factor's AA_h buffer) and
  copied again; the pattern and analysis data are left untouched.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    nrows    = A->rmap->n;
  const PetscInt                    *colidx  = aij->j,*diag = aij->diag;
  const MatScalar                   *vals    = aij->a;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only act when the host holds the current factor data */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* total number of entries of the upper factor, diagonal included */
    const PetscInt nnzU = diag[0]-diag[nrows];
    PetscInt       pos  = nnzU; /* write cursor, moves towards the front as rows are packed */
    PetscInt       row;

    if (upper) {
      /* The factor already lives on the device: recompute the values only */
      if (!upper->AA_h) {
        PetscCallCUDA(cudaMallocHost((void**) &upper->AA_h, nnzU*sizeof(PetscScalar)));
      }
      for (row=nrows-1; row>=0; row--) {
        const MatScalar *rowVals = vals + diag[row+1] + 1;
        const PetscInt  noff     = diag[row] - diag[row+1] - 1; /* entries off the diagonal */

        pos -= (noff+1);
        /* diagonal first, as a reciprocal; then the rest of the row */
        upper->AA_h[pos] = 1./rowVals[noff];
        PetscCall(PetscArraycpy(&(upper->AA_h[pos+1]), rowVals, noff));
      }
      upper->csrMat->values->assign(upper->AA_h, upper->AA_h+nnzU);
      PetscCall(PetscLogCpuToGpu(nnzU*sizeof(PetscScalar)));
    } else {
      PetscScalar *hostVals;
      PetscInt    *hostRowPtr,*hostColIdx;
      CsrMatrix   *csr;

      /* page-locked staging buffers for values, row offsets and column indices */
      PetscCallCUDA(cudaMallocHost((void**) &hostVals, nnzU*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMallocHost((void**) &hostRowPtr, (nrows+1)*sizeof(PetscInt)));
      PetscCallCUDA(cudaMallocHost((void**) &hostColIdx, nnzU*sizeof(PetscInt)));

      /* pack the rows back to front */
      hostRowPtr[0]     = (PetscInt) 0;
      hostRowPtr[nrows] = nnzU;
      for (row=nrows-1; row>=0; row--) {
        const MatScalar *rowVals = vals   + diag[row+1] + 1;
        const PetscInt  *rowCols = colidx + diag[row+1] + 1;
        const PetscInt  noff     = diag[row] - diag[row+1] - 1; /* entries off the diagonal */

        pos -= (noff+1);

        /* the diagonal leads the row and is stored inverted */
        hostColIdx[pos] = (PetscInt) row;
        hostVals[pos]   = (MatScalar)1./rowVals[noff];
        hostRowPtr[row] = pos; /* equals hostRowPtr[row+1] - (noff+1) */

        PetscCall(PetscArraycpy(&(hostColIdx[pos+1]), rowCols, noff));
        PetscCall(PetscArraycpy(&(hostVals[pos+1]), rowVals, noff));
      }

      /* bookkeeping structure for the new factor */
      PetscCall(PetscNew(&upper));
      upper->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill, explicit diagonal */
      PetscCallCUSPARSE(cusparseCreateMatDescr(&upper->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(upper->descr, CUSPARSE_INDEX_BASE_ZERO));
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      PetscCallCUSPARSE(cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
     #else
      PetscCallCUSPARSE(cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
     #endif
      PetscCallCUSPARSE(cusparseSetMatFillMode(upper->descr, CUSPARSE_FILL_MODE_UPPER));
      PetscCallCUSPARSE(cusparseSetMatDiagType(upper->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

      /* the factor is applied as is */
      upper->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR storage, filled from the staging buffers */
      csr               = new CsrMatrix;
      upper->csrMat     = csr;
      csr->num_rows     = nrows;
      csr->num_cols     = nrows;
      csr->num_entries  = nnzU;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(hostRowPtr, hostRowPtr+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzU);
      csr->column_indices->assign(hostColIdx, hostColIdx+nnzU);

      csr->values = new THRUSTARRAY(nnzU);
      csr->values->assign(hostVals, hostVals+nnzU);

      /* solve analysis: info object, then (CUDA >= 9) the work buffer */
      PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
      PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upper->solveInfo));
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      PetscCallCUSPARSE(cusparseXcsrsv_buffsize(factors->handle, upper->solveOp,
                                             csr->num_rows, csr->num_entries, upper->descr,
                                             csr->values->data().get(), csr->row_offsets->data().get(),
                                             csr->column_indices->data().get(), upper->solveInfo,
                                             &upper->solveBufferSize));
      PetscCallCUDA(cudaMalloc(&upper->solveBuffer,upper->solveBufferSize));
    #endif

      /* run the analysis itself */
      PetscCallCUSPARSE(cusparseXcsrsv_analysis(factors->handle, upper->solveOp,
                                       csr->num_rows, csr->num_entries, upper->descr,
                                       csr->values->data().get(), csr->row_offsets->data().get(),
                                       csr->column_indices->data().get(),
                                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                       upper->solveInfo,
                                       upper->solvePolicy, upper->solveBuffer));
                                       #else
                                       upper->solveInfo));
                                       #endif
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

      /* publish the factor; the value buffer is kept for later refreshes */
      factors->upTriFactorPtr = upper;
      upper->AA_h             = hostVals;
      PetscCallCUDA(cudaFreeHost(hostRowPtr));
      PetscCallCUDA(cudaFreeHost(hostColIdx));
      PetscCall(PetscLogCpuToGpu((nrows+1+nnzU)*sizeof(int)+nnzU*sizeof(PetscScalar)));
    }
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
5519ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both ILU triangular factors and
  copies the row/column permutations used by the triangular solves

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Calls the lower and upper factor builders, creates the work vector if it does not exist,
  records a->nz as the factor nonzero count and sets A->offloadmask to PETSC_OFFLOAD_BOTH.

  A permutation is copied only when its index set is not the identity and no copy has been
  stored yet in rpermIndices / cpermIndices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row,colperm = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  PetscBool                    identity;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation, used with the lower triangular factor */
  PetscCall(ISIdentity(rowperm,&identity));
  if (!identity) {
    if (!factors->rpermIndices) {
      const PetscInt *ridx;

      PetscCall(ISGetIndices(rowperm,&ridx));
      factors->rpermIndices = new THRUSTINTARRAY(nrows);
      factors->rpermIndices->assign(ridx, ridx+nrows);
      PetscCall(ISRestoreIndices(rowperm,&ridx));
      PetscCall(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
    }
  }

  /* column permutation, used with the upper triangular factor */
  PetscCall(ISIdentity(colperm,&identity));
  if (!identity) {
    if (!factors->cpermIndices) {
      const PetscInt *cidx;

      PetscCall(ISGetIndices(colperm,&cidx));
      factors->cpermIndices = new THRUSTINTARRAY(nrows);
      factors->cpermIndices->assign(cidx, cidx+nrows);
      PetscCall(ISRestoreIndices(colperm,&cidx));
      PetscCall(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
    }
  }
  PetscFunctionReturn(0);
}
5949ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildICCTriMatrices - Creates, or refreshes the values of, the two
  triangular factors used for ICC/Cholesky solves with cuSPARSE

  Input Parameter:
. A - the factored matrix; its host data is read through the Mat_SeqSBAIJ view of A->data
      (row offsets i, column indices j, values a)

  Notes:
  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  Each host row keeps its diagonal value d as its last entry. In the packed arrays the diagonal
  is moved to the front of the row and stored as 1/d in both value arrays. Every off-diagonal
  entry v becomes -v in the "upper" values and -v/d in the "lower" values.

  Both factors share the same (upper triangular) CSR pattern. The upper factor is described with
  a unit diagonal and is solved without transposition; the lower factor is described with a
  non-unit diagonal and is solved with CUSPARSE_OPERATION_TRANSPOSE.

  When neither factor exists both are created and analysed. Otherwise only the values are
  recomputed and copied again, and both factors are required to exist.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqSBAIJ                      *sb      = (Mat_SeqSBAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    nnz      = aij->nz,nrows = A->rmap->n;
  const PetscInt                    *rowptr  = sb->i,*colidx = sb->j;
  const MatScalar                   *vals    = sb->a;
  PetscScalar                       *upVals,*loVals;
  PetscInt                          row,k,pos;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* only act when the host holds the current factor data */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* page-locked staging buffers for the two value arrays */
    PetscCallCUDA(cudaMallocHost((void**) &upVals, nnz*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMallocHost((void**) &loVals, nnz*sizeof(PetscScalar)));

    if (upper || lower) {
      /* Factors were built before: recompute the values only */
      pos = 0;
      for (row=0; row<nrows; row++) {
        const MatScalar *rowVals = vals + rowptr[row];
        const PetscInt  noff     = rowptr[row+1] - rowptr[row] - 1; /* row length without the diagonal */

        /* diagonal slot */
        upVals[pos] = 1.0/rowVals[noff];
        loVals[pos] = 1.0/rowVals[noff];
        pos++;

        if (noff>0) {
          PetscCall(PetscArraycpy(&(upVals[pos]), rowVals, noff));
          for (k=0; k<noff; k++) {
            upVals[pos+k] = -upVals[pos+k];
            loVals[pos+k] = upVals[pos+k]/rowVals[noff];
          }
          pos += noff;
        }
      }
      PetscCheck(upper,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
      PetscCheck(lower,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
      upper->csrMat->values->assign(upVals, upVals+nnz);
      lower->csrMat->values->assign(loVals, loVals+nnz);
      PetscCall(PetscLogCpuToGpu(2*(nnz)*sizeof(PetscScalar)));
    } else {
      PetscInt  *hostRowPtr,*hostColIdx;
      CsrMatrix *upCsr,*loCsr;

      /* staging buffers for the pattern shared by both factors */
      PetscCallCUDA(cudaMallocHost((void**) &hostRowPtr, (nrows+1)*sizeof(PetscInt)));
      PetscCallCUDA(cudaMallocHost((void**) &hostColIdx, nnz*sizeof(PetscInt)));

      /* pack pattern and values row by row, diagonal first */
      hostRowPtr[0]     = (PetscInt) 0;
      hostRowPtr[nrows] = nnz;
      pos               = 0;
      for (row=0; row<nrows; row++) {
        const MatScalar   *rowVals = vals   + rowptr[row];
        const PetscInt    *rowCols = colidx + rowptr[row];
        const PetscInt    noff     = rowptr[row+1] - rowptr[row] - 1; /* row length without the diagonal */
        const PetscScalar invDiag  = (MatScalar)1.0/rowVals[noff];

        /* diagonal slot */
        hostColIdx[pos] = (PetscInt) row;
        upVals[pos]     = invDiag;
        hostRowPtr[row] = pos;
        loVals[pos]     = invDiag;
        pos++;

        if (noff>0) {
          PetscCall(PetscArraycpy(&(hostColIdx[pos]), rowCols, noff));
          PetscCall(PetscArraycpy(&(upVals[pos]), rowVals, noff));
          for (k=0; k<noff; k++) {
            upVals[pos+k] = -upVals[pos+k];
            loVals[pos+k] = upVals[pos+k]/rowVals[noff];
          }
          pos += noff;
        }
      }

      /* ---- upper factor ---- */
      PetscCall(PetscNew(&upper));
      upper->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* descriptor: zero based, upper fill, unit diagonal */
      PetscCallCUSPARSE(cusparseCreateMatDescr(&upper->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(upper->descr, CUSPARSE_INDEX_BASE_ZERO));
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      PetscCallCUSPARSE(cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
     #else
      PetscCallCUSPARSE(cusparseSetMatType(upper->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
     #endif
      PetscCallCUSPARSE(cusparseSetMatFillMode(upper->descr, CUSPARSE_FILL_MODE_UPPER));
      PetscCallCUSPARSE(cusparseSetMatDiagType(upper->descr, CUSPARSE_DIAG_TYPE_UNIT));

      /* CSR storage */
      upCsr              = new CsrMatrix;
      upper->csrMat      = upCsr;
      upCsr->num_rows    = nrows;
      upCsr->num_cols    = A->cmap->n;
      upCsr->num_entries = nnz;

      upCsr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      upCsr->row_offsets->assign(hostRowPtr, hostRowPtr+nrows+1);

      upCsr->column_indices = new THRUSTINTARRAY32(nnz);
      upCsr->column_indices->assign(hostColIdx, hostColIdx+nnz);

      upCsr->values = new THRUSTARRAY(nnz);
      upCsr->values->assign(upVals, upVals+nnz);

      /* applied without transposition */
      upper->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* solve analysis: info object, then (CUDA >= 9) the work buffer */
      PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
      PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upper->solveInfo));
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      PetscCallCUSPARSE(cusparseXcsrsv_buffsize(factors->handle, upper->solveOp,
                                             upCsr->num_rows, upCsr->num_entries, upper->descr,
                                             upCsr->values->data().get(), upCsr->row_offsets->data().get(),
                                             upCsr->column_indices->data().get(), upper->solveInfo,
                                             &upper->solveBufferSize));
      PetscCallCUDA(cudaMalloc(&upper->solveBuffer,upper->solveBufferSize));
    #endif

      /* run the analysis itself */
      PetscCallCUSPARSE(cusparseXcsrsv_analysis(factors->handle, upper->solveOp,
                                       upCsr->num_rows, upCsr->num_entries, upper->descr,
                                       upCsr->values->data().get(), upCsr->row_offsets->data().get(),
                                       upCsr->column_indices->data().get(),
                                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                       upper->solveInfo,
                                       upper->solvePolicy, upper->solveBuffer));
                                       #else
                                       upper->solveInfo));
                                       #endif
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

      /* publish the upper factor */
      factors->upTriFactorPtr = upper;

      /* ---- lower factor (same pattern, solved transposed) ---- */
      PetscCall(PetscNew(&lower));
      lower->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* descriptor: zero based, upper fill, explicit diagonal */
      PetscCallCUSPARSE(cusparseCreateMatDescr(&lower->descr));
      PetscCallCUSPARSE(cusparseSetMatIndexBase(lower->descr, CUSPARSE_INDEX_BASE_ZERO));
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      PetscCallCUSPARSE(cusparseSetMatType(lower->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
     #else
      PetscCallCUSPARSE(cusparseSetMatType(lower->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
     #endif
      PetscCallCUSPARSE(cusparseSetMatFillMode(lower->descr, CUSPARSE_FILL_MODE_UPPER));
      PetscCallCUSPARSE(cusparseSetMatDiagType(lower->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

      /* applied transposed */
      lower->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* CSR storage */
      loCsr              = new CsrMatrix;
      lower->csrMat      = loCsr;
      loCsr->num_rows    = nrows;
      loCsr->num_cols    = A->cmap->n;
      loCsr->num_entries = nnz;

      loCsr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      loCsr->row_offsets->assign(hostRowPtr, hostRowPtr+nrows+1);

      loCsr->column_indices = new THRUSTINTARRAY32(nnz);
      loCsr->column_indices->assign(hostColIdx, hostColIdx+nnz);

      loCsr->values = new THRUSTARRAY(nnz);
      loCsr->values->assign(loVals, loVals+nnz);

      /* solve analysis: info object, then (CUDA >= 9) the work buffer */
      PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
      PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&lower->solveInfo));
    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      PetscCallCUSPARSE(cusparseXcsrsv_buffsize(factors->handle, lower->solveOp,
                                             loCsr->num_rows, loCsr->num_entries, lower->descr,
                                             loCsr->values->data().get(), loCsr->row_offsets->data().get(),
                                             loCsr->column_indices->data().get(), lower->solveInfo,
                                             &lower->solveBufferSize));
      PetscCallCUDA(cudaMalloc(&lower->solveBuffer,lower->solveBufferSize));
    #endif

      /* run the analysis itself */
      PetscCallCUSPARSE(cusparseXcsrsv_analysis(factors->handle, lower->solveOp,
                                       loCsr->num_rows, loCsr->num_entries, lower->descr,
                                       loCsr->values->data().get(), loCsr->row_offsets->data().get(),
                                       loCsr->column_indices->data().get(),
                                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                       lower->solveInfo,
                                       lower->solvePolicy, lower->solveBuffer));
                                       #else
                                       lower->solveInfo));
                                       #endif
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

      /* publish the lower factor */
      factors->loTriFactorPtr = lower;

      PetscCall(PetscLogCpuToGpu(2*(((nrows+1)+(nnz))*sizeof(int)+(nnz)*sizeof(PetscScalar))));
      PetscCallCUDA(cudaFreeHost(hostRowPtr));
      PetscCallCUDA(cudaFreeHost(hostColIdx));
    }
    /* the value staging buffers are not kept between calls */
    PetscCallCUDA(cudaFreeHost(upVals));
    PetscCallCUDA(cudaFreeHost(loVals));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
811087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors and copies
  the permutation (and its inverse) used by the triangular solves

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Calls MatSeqAIJCUSPARSEBuildICCTriMatrices(), creates the work vector if it does not exist,
  records (a->nz-n)*2 + n as the factor nonzero count and sets A->offloadmask to PETSC_OFFLOAD_BOTH.

  When the ordering a->row is not the identity, rpermIndices receives the permutation and
  cpermIndices receives its inverse. This routine runs on every numeric factorization, so the
  index arrays are allocated only when they do not exist yet and are otherwise refilled in
  place; allocating them unconditionally would drop the arrays of the previous call.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* off-diagonal entries are counted twice (both triangles), the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices; nothing to copy for the natural ordering */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    /* reuse arrays left by an earlier call instead of leaking them */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
848087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky/ICC factorization for SEQAIJCUSPARSE

  Input Parameters:
+ B    - the factor matrix, filled by this routine
. A    - the matrix to factor
- info - factorization options, passed through to MatCholeskyFactorNumeric_SeqAIJ()

  Notes:
  The matrix is first copied back from the GPU and factored by MatCholeskyFactorNumeric_SeqAIJ(),
  after which B is marked PETSC_OFFLOAD_CPU. The solve and transpose-solve callbacks are then
  chosen according to whether the ordering stored in B is the identity, the MatMatSolve
  callbacks are cleared, and the triangular factors are built and analysed for cuSPARSE.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *fact = (Mat_SeqAIJ*)B->data;
  IS         perm  = fact->row;
  PetscBool  natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the natural ordering gets solve routines that skip the permutation */
  PetscCall(ISIdentity(perm,&natural));
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no multi-vector solves in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
8779ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private - Forms the explicit transpose of one
  triangular factor and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix; used only as the object attached to the log events
. cusparseTriFactors - the triangular-factor container; supplies the cuSPARSE handle
- triFactor          - the factor to transpose. Its descriptor and CSR arrays are read; with
                       CUDA >= 11 its csr2cscBuffer/csr2cscBufferSize members are also set here.

  Output Parameter:
. triFactorT_out - the newly created transposed factor (descriptor, CSR storage, solve info and,
                   with CUDA >= 9, solve buffer)

  Notes:
  The transposed factor copies the matrix type, index base and diagonal type of the source
  descriptor and uses the opposite fill mode. Because the transpose is stored explicitly, its
  solve operation is CUSPARSE_OPERATION_NON_TRANSPOSE.

  The caller is responsible for storing the returned pointer in the container.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT_out)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* fail with a PETSc error rather than dereferencing a NULL factor below */
  PetscCheck(triFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing triangular factor");

  /* allocate space for the transposed factor */
  PetscCall(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor properties of the source factor; the fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* set the operation: the transpose is explicit, so the solve itself is non-transposed */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the source factor (row/column counts are swapped) */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the source factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* CUDA >= 11 needs a work buffer for the conversion; it is kept on the source factor */
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                               triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                               triFactor->csrMat->values->data().get(),
                                               triFactor->csrMat->row_offsets->data().get(),
                                               triFactor->csrMat->column_indices->data().get(),
                                               triFactorT->csrMat->values->data().get(),
                                               triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                  triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                  triFactor->csrMat->values->data().get(),
                                  triFactor->csrMat->row_offsets->data().get(),
                                  triFactor->csrMat->column_indices->data().get(),
                                  triFactorT->csrMat->values->data().get(),
                                  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                  triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer));
                                  #else
                                  triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
                                  #endif
  PetscCallCUDA(WaitForCUDA());
  /* close the event opened above (the previous code issued a second Begin here, leaving it unbalanced) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                         triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                         triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                         triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                         &triFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                                   triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                   triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                   triFactorT->csrMat->column_indices->data().get(),
                                   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                   triFactorT->solveInfo,
                                   triFactorT->solvePolicy, triFactorT->solveBuffer));
                                   #else
                                   triFactorT->solveInfo));
                                   #endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  *triFactorT_out = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds the explicit transposes of the lower and
  upper triangular factors of A and performs the cuSPARSE solve analysis on both.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are set

  Notes:
  On return the container's loTriFactorPtrTranspose and upTriFactorPtrTranspose members point
  to the newly built transposed factors. The lower factor is processed first and stored before
  the upper factor is started.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT));
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT));
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1081bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, that converts a PetscScalar to a PetscInt
   by taking its real part (PetscRealPart) and casting the result to PetscInt. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar value)
  {
    const PetscInt asInt = (PetscInt)PetscRealPart(value);
    return asInt;
  }
};
1090a49f1ed0SStefano Zampini 
10913606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1092bda325fcSPaul Mullowney {
1093aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1094a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1095bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1096bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1097aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1098b175d8bbSPaul Mullowney 
1099bda325fcSPaul Mullowney   PetscFunctionBegin;
11009566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1101a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
110228b400f6SJacob Faibussowitsch   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1103a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
110408401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
11051a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
11069566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
11079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1108a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
11099566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1110a49f1ed0SStefano Zampini   }
1111a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1112aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
11139566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1114aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
11159566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
11169566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1117aa372e3fSPaul Mullowney 
1118b06137fdSPaul Mullowney     /* set alpha and beta */
11199566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
11209566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
11219566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
11229566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
11239566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
11249566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1125b06137fdSPaul Mullowney 
1126aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1127aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1128a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1129554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1130554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1131aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1132a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1133aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1134aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1135a3fdcf43SKarl Rupp 
1136039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
113781902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1138afb2bd1cSJunchao Zhang 
1139afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
11403606e59fSJunchao Zhang       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1141afb2bd1cSJunchao Zhang         stat = cusparseCreateCsr(&matstructT->matDescr,
1142afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1143afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1144afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1145afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
11469566063dSJacob Faibussowitsch                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
11473606e59fSJunchao Zhang       #else
11483606e59fSJunchao Zhang         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
11493606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
11503606e59fSJunchao Zhang 
11513606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
11523606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
11533606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
11543606e59fSJunchao Zhang         */
11553606e59fSJunchao Zhang         if (matrixT->num_entries) {
11563606e59fSJunchao Zhang           stat = cusparseCreateCsr(&matstructT->matDescr,
11573606e59fSJunchao Zhang                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
11583606e59fSJunchao Zhang                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
11593606e59fSJunchao Zhang                                  matrixT->values->data().get(),
11603606e59fSJunchao Zhang                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
11619566063dSJacob Faibussowitsch                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
11623606e59fSJunchao Zhang 
11633606e59fSJunchao Zhang         } else {
11643606e59fSJunchao Zhang           matstructT->matDescr = NULL;
11653606e59fSJunchao Zhang           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
11663606e59fSJunchao Zhang         }
11673606e59fSJunchao Zhang       #endif
1168afb2bd1cSJunchao Zhang      #endif
1169aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1170afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1171afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1172afb2bd1cSJunchao Zhang    #else
1173aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
117451c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
117551c6d536SStefano Zampini       /* First convert HYB to CSR */
1176aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1177aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1178aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1179aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1180aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1181aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1182aa372e3fSPaul Mullowney 
1183aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1184aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1185aa372e3fSPaul Mullowney                               temp->values->data().get(),
1186aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
11879566063dSJacob Faibussowitsch                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1188aa372e3fSPaul Mullowney 
1189aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1190aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1191aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1192aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1193aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1194aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1195aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1196aa372e3fSPaul Mullowney 
1197aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1198aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1199aa372e3fSPaul Mullowney                               temp->values->data().get(),
1200aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1201aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1202aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1203aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1204aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
12059566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1206aa372e3fSPaul Mullowney 
1207aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1208aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
12099566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1210aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1211aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1212aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1213aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1214aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1215aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
12169566063dSJacob Faibussowitsch                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1217aa372e3fSPaul Mullowney 
1218aa372e3fSPaul Mullowney       /* assign the pointer */
1219aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
12201a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1221aa372e3fSPaul Mullowney       /* delete temporaries */
1222aa372e3fSPaul Mullowney       if (tempT) {
1223aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1224aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1225aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1226aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1227087f3262SPaul Mullowney       }
1228aa372e3fSPaul Mullowney       if (temp) {
1229aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1230aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1231aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1232aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1233aa372e3fSPaul Mullowney       }
1234afb2bd1cSJunchao Zhang      #endif
1235aa372e3fSPaul Mullowney     }
1236a49f1ed0SStefano Zampini   }
1237a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1238a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1239a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
124028b400f6SJacob Faibussowitsch     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
124128b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
124228b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
124328b400f6SJacob Faibussowitsch     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
124428b400f6SJacob Faibussowitsch     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
124528b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
124628b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
124728b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1248a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1249a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1250a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
12519566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1252a49f1ed0SStefano Zampini     }
1253a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1254a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1255a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1256a49f1ed0SStefano Zampini 
1257a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1258a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1259a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1260a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1261a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1262a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1263a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1264a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1265a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1266a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1267a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1268a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
12699566063dSJacob Faibussowitsch                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
12709566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1271a49f1ed0SStefano Zampini      #endif
1272a49f1ed0SStefano Zampini 
12731a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
12741a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
12751a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
12761a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
12771a2c6b5cSJunchao Zhang 
12781a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
12791a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
12801a2c6b5cSJunchao Zhang         */
12811a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
12821a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
12831a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
12841a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
12851a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1286a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1287a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1288a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1289a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
12909566063dSJacob Faibussowitsch                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1291a49f1ed0SStefano Zampini                              #else
1292a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
12939566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1294a49f1ed0SStefano Zampini                              #endif
12951a2c6b5cSJunchao Zhang       } else {
12961a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
12971a2c6b5cSJunchao Zhang       }
12981a2c6b5cSJunchao Zhang 
1299a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1300a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1301a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
13029566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1303a49f1ed0SStefano Zampini      #endif
1304a49f1ed0SStefano Zampini     }
1305a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1306a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1307a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1308a49f1ed0SStefano Zampini   }
13099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
13109566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1311213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1312213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1313aa372e3fSPaul Mullowney   /* assign the pointer */
1314aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
13151a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1316bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1317bda325fcSPaul Mullowney }
1318bda325fcSPaul Mullowney 
1319a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Solve with the transposed triangular factors kept in A->spptr,
  gathering the right-hand side through rpermIndices before, and the result through cpermIndices
  after, the two cuSPARSE triangular solves.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only on the device

  Output Parameter:
. xx - solution, accessed write-only on the device

  Notes:
  When neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called
  first and the two factor pointers are re-read.

  The Thrust copies are wrapped in PetscStackCallThrust(), matching the other Thrust calls in this file.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather bb through rpermIndices into xx.
     Both ends of the range use the same element base; the extent of a permutation range comes
     from its index iterators (see the Thrust permutation_iterator documentation). */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U: reads xarray, writes the work vector */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        upTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);
#else
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get());
#endif
  PetscCallCUSPARSE(stat);

  /* Then, solve L: reads the work vector, writes xarray */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        loTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);
#else
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray);
#endif
  PetscCallCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1406bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transposed-factor solve with no permutation step:
  bb is pushed through the transposed upper factor into the work vector, and the work vector is then
  pushed through the transposed lower factor into xx.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only on the device

  Output Parameter:
. xx - solution, accessed write-only on the device

  Notes:
  When neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called
  first and the two factor pointers are re-read.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  status;

  PetscFunctionBegin;
  /* Build the transposed factors lazily, the first time a transpose solve is requested */
  if (!(lowerT || upperT)) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device views of the output and the input vector */
  PetscCall(VecCUDAGetArrayWrite(xx,&sol));
  PetscCall(VecCUDAGetArrayRead(bb,&rhs));

  PetscCall(PetscLogGpuTimeBegin());

  /* Step 1: upper factor, rhs -> work */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  status = cusparseXcsrsv_solve(factors->handle, upperT->solveOp,
                                upperT->csrMat->num_rows,
                                upperT->csrMat->num_entries,
                                &PETSC_CUSPARSE_ONE, upperT->descr,
                                upperT->csrMat->values->data().get(),
                                upperT->csrMat->row_offsets->data().get(),
                                upperT->csrMat->column_indices->data().get(),
                                upperT->solveInfo,
                                rhs,
                                work->data().get(),
                                upperT->solvePolicy, upperT->solveBuffer);
#else
  status = cusparseXcsrsv_solve(factors->handle, upperT->solveOp,
                                upperT->csrMat->num_rows,
                                &PETSC_CUSPARSE_ONE, upperT->descr,
                                upperT->csrMat->values->data().get(),
                                upperT->csrMat->row_offsets->data().get(),
                                upperT->csrMat->column_indices->data().get(),
                                upperT->solveInfo,
                                rhs,
                                work->data().get());
#endif
  PetscCallCUSPARSE(status);

  /* Step 2: lower factor, work -> sol */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  status = cusparseXcsrsv_solve(factors->handle, lowerT->solveOp,
                                lowerT->csrMat->num_rows,
                                lowerT->csrMat->num_entries,
                                &PETSC_CUSPARSE_ONE, lowerT->descr,
                                lowerT->csrMat->values->data().get(),
                                lowerT->csrMat->row_offsets->data().get(),
                                lowerT->csrMat->column_indices->data().get(),
                                lowerT->solveInfo,
                                work->data().get(),
                                sol,
                                lowerT->solvePolicy, lowerT->solveBuffer);
#else
  status = cusparseXcsrsv_solve(factors->handle, lowerT->solveOp,
                                lowerT->csrMat->num_rows,
                                &PETSC_CUSPARSE_ONE, lowerT->descr,
                                lowerT->csrMat->values->data().get(),
                                lowerT->csrMat->row_offsets->data().get(),
                                lowerT->csrMat->column_indices->data().get(),
                                lowerT->solveInfo,
                                work->data().get(),
                                sol);
#endif
  PetscCallCUSPARSE(status);

  /* Hand the arrays back, then close the timer and account for the flops */
  PetscCall(VecCUDARestoreArrayRead(bb,&rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx,&sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
1475bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Solve with the triangular factors kept in A->spptr: gather bb through
  rpermIndices into the work vector, solve with the lower factor, solve with the upper factor, and
  gather the result through cpermIndices into xx.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only on the device

  Output Parameter:
. xx - solution, accessed write-only on the device

  Notes:
  The Thrust copies are wrapped in PetscStackCallThrust(), matching the other Thrust calls in this file.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
  PetscCall(VecCUDAGetArrayRead(bb,&barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation: gather bb through rpermIndices into the work vector */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    tempGPU->begin()));

  /* Next, solve L: reads the work vector, writes xarray */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                        loTriFactor->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactor->solvePolicy, loTriFactor->solveBuffer);
#else
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(),
                        xarray);
#endif
  PetscCallCUSPARSE(stat);

  /* Then, solve U: reads xarray, writes the work vector */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                        upTriFactor->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray,
                        tempGPU->data().get(),
                        upTriFactor->solvePolicy, upTriFactor->solveBuffer);
#else
  stat = cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray,
                        tempGPU->data().get());
#endif
  PetscCallCUSPARSE(stat);

  /* Last, reorder with the column permutation: gather the work vector through cpermIndices into xx */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
                                    xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb,&barray));
  PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
15509ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Factor solve with no permutation step: bb is pushed
  through the lower factor into the work vector, and the work vector is then pushed through the
  upper factor into xx.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only on the device

  Output Parameter:
. xx - solution, accessed write-only on the device
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  status;

  PetscFunctionBegin;
  /* Device views of the output and the input vector */
  PetscCall(VecCUDAGetArrayWrite(xx,&sol));
  PetscCall(VecCUDAGetArrayRead(bb,&rhs));

  PetscCall(PetscLogGpuTimeBegin());

  /* Step 1: lower factor, rhs -> work */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  status = cusparseXcsrsv_solve(factors->handle, lower->solveOp,
                                lower->csrMat->num_rows,
                                lower->csrMat->num_entries,
                                &PETSC_CUSPARSE_ONE, lower->descr,
                                lower->csrMat->values->data().get(),
                                lower->csrMat->row_offsets->data().get(),
                                lower->csrMat->column_indices->data().get(),
                                lower->solveInfo,
                                rhs,
                                work->data().get(),
                                lower->solvePolicy,lower->solveBuffer);
#else
  status = cusparseXcsrsv_solve(factors->handle, lower->solveOp,
                                lower->csrMat->num_rows,
                                &PETSC_CUSPARSE_ONE, lower->descr,
                                lower->csrMat->values->data().get(),
                                lower->csrMat->row_offsets->data().get(),
                                lower->csrMat->column_indices->data().get(),
                                lower->solveInfo,
                                rhs,
                                work->data().get());
#endif
  PetscCallCUSPARSE(status);

  /* Step 2: upper factor, work -> sol */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  status = cusparseXcsrsv_solve(factors->handle, upper->solveOp,
                                upper->csrMat->num_rows,
                                upper->csrMat->num_entries,
                                &PETSC_CUSPARSE_ONE, upper->descr,
                                upper->csrMat->values->data().get(),
                                upper->csrMat->row_offsets->data().get(),
                                upper->csrMat->column_indices->data().get(),
                                upper->solveInfo,
                                work->data().get(),
                                sol,
                                upper->solvePolicy, upper->solveBuffer);
#else
  status = cusparseXcsrsv_solve(factors->handle, upper->solveOp,
                                upper->csrMat->num_rows,
                                &PETSC_CUSPARSE_ONE, upper->descr,
                                upper->csrMat->values->data().get(),
                                upper->csrMat->row_offsets->data().get(),
                                upper->csrMat->column_indices->data().get(),
                                upper->solveInfo,
                                work->data().get(),
                                sol);
#endif
  PetscCallCUSPARSE(status);

  /* Hand the arrays back, then close the timer and account for the flops */
  PetscCall(VecCUDARestoreArrayRead(bb,&rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx,&sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
16119ae82921SPaul Mullowney 
1612da112707SJunchao Zhang #if CUSPARSE_VERSION >= 11500
1613da112707SJunchao Zhang /* cusparseSpSV_solve() and friends first appeared in cusparse-11.3 */
/*
  MatSolve_SeqAIJCUSPARSE_ILU0 - Solves (L U) x = b with the ILU(0) factors referenced by fact->spptr

  Input Parameters:
+ fact - the factored matrix; its spptr holds the cuSPARSE descriptors of L and U
- b    - the right-hand side vector

  Output Parameter:
. x - the solution vector

  Notes:
  Two cusparseSpSV_solve() calls are chained through the work array fs->Y: first L y = b, then U x = y.
  The dense-vector descriptor dnVecDescr_X is bound to b for the first solve and re-bound to x for the second.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&sol));
  PetscCall(VecCUDAGetArrayRead(b,&rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* First sweep, L y = b: descriptor X points at b, descriptor Y points at the work array fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  /* No buffer is passed here: cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()! */
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y, /* L Y = X */
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_L));

  /* Second sweep, U x = y: descriptor X is re-pointed at x, descriptor Y still refers to fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_U,fs->dnVecDescr_Y,fs->dnVecDescr_X, /* U X = Y */
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_U));

  PetscCall(VecCUDARestoreArrayRead(b,&rhs));
  PetscCall(VecCUDARestoreArrayWrite(x,&sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1658da112707SJunchao Zhang 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_ILU0 - Solves (L U)^T x = b with the ILU(0) factors referenced by fact->spptr

  Input Parameters:
+ fact - the factored matrix; its spptr holds the cuSPARSE descriptors of L and U
- b    - the right-hand side vector

  Output Parameter:
. x - the solution vector

  Notes:
  The SpSV descriptors and buffers for the transposed solves are created on the first call only
  (guarded by fs->createdTransposeSpSVDescr). The transposed analysis is redone whenever
  fs->updatedTransposeSpSVAnalysis is false. The solve itself is U^T y = b followed by L^T x = y,
  with fs->Y as the intermediate vector.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_ILU0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  /* One-time setup, done the first time MatSolveTranspose() is called */
  if (!fs->createdTransposeSpSVDescr) {
    /* The matrix is still L. We only do a transpose solve with it */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                              fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y,
                                              cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,
                                              fs->spsvDescr_Lt,&fs->spsvBufferSize_Lt));

    /* Likewise for U */
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                              fs->spMatDescr_U,fs->dnVecDescr_X,fs->dnVecDescr_Y,
                                              cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,
                                              fs->spsvDescr_Ut,&fs->spsvBufferSize_Ut));

    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Ut,fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  /* (Re)run the transposed analysis when the flag says it is out of date */
  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y,
                                            cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Lt,fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                            fs->spMatDescr_U,fs->dnVecDescr_X,fs->dnVecDescr_Y,
                                            cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,
                                            fs->spsvDescr_Ut,fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x,&sol));
  PetscCall(VecCUDAGetArrayRead(b,&rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* First sweep, Ut y = b: descriptor X points at b, descriptor Y points at the work array fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_U,fs->dnVecDescr_X,fs->dnVecDescr_Y, /* Ut Y = X */
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_Ut));

  /* Second sweep, Lt x = y: descriptor X is re-pointed at x, descriptor Y still refers to fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L,fs->dnVecDescr_Y,fs->dnVecDescr_X, /* Lt X = Y */
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&rhs));
  PetscCall(VecCUDARestoreArrayWrite(x,&sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*aij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
1756da112707SJunchao Zhang 
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric stage of the cuSPARSE ILU(0) factorization

  Input Parameters:
+ fact - the factor matrix; fs->csrVal (reached through fact->spptr) is overwritten in place with the factors
. A    - the matrix to factor (checked to be MATSEQAIJCUSPARSE in debug builds)
- info - factorization options, not referenced by this routine

  Notes:
  The routine copies A's device values into fs->csrVal, factors them in place with cusparseXcsrilu02(),
  then runs the non-transposed cusparseSpSV_analysis() for L and U. It finally installs the ILU0 solve
  routines on fact and marks the transposed SpSV analysis as out of date.

  The GPU work is bracketed with PetscLogGpuTimeBegin()/PetscLogGpuTimeEnd() so that the flops logged at
  the end have a matching GPU time, as in the solve routines of this file.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                     m,nz;
  PetscBool                    flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); /* called before Acusp->mat->mat is read below */
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* Time the factorization and the analyses on the GPU; started only after MatSeqAIJCUSPARSECopyToGPU() has returned */
  PetscCall(PetscLogGpuTimeBegin());

  /* Factorize fact inplace */
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csrilu02: A(%d,%d) is zero",numerical_zero,numerical_zero);
  }

  /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
     See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    fs->spsvBuffer_L));

  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    fs->spsvBuffer_U));
  PetscCall(PetscLogGpuTimeEnd());

  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ILU0;
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_ILU0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
1831da112707SJunchao Zhang 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0 - Symbolic stage of the cuSPARSE ILU(0) factorization

  Input Parameters:
+ fact  - the factor matrix to set up; the Mat_SeqAIJCUSPARSETriFactors in fact->spptr is reset and rebuilt
. A     - the matrix to factor (in debug builds checked to be a square MATSEQAIJCUSPARSE with no missing diagonal)
. isrow - row permutation, not referenced by this routine
. iscol - column permutation, not referenced by this routine
- info  - factorization options; only info->fill is read, and it is merely recorded in fact->info

  Notes:
  L and U share one set of device CSR arrays (fs->csrRowPtr, fs->csrColIdx, fs->csrVal) that holds a copy
  of A's sparsity pattern; the two cuSPARSE matrix descriptors differ only in fill mode and diagonal type.
  The routine also allocates the work buffers, runs the csrilu02 analysis, estimates the flops of the
  numeric stage, and installs MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 as the numeric routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
  PetscInt                     m,nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg,missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
    PetscCall(MatMissingDiagonal(A,&missing,&i));
    PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ILU;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai,*Aj; /* A's row offsets and column indices as returned by MatSeqAIJCUSPARSEGetIJ(); used as device-to-device copy sources */

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
  PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create descriptors for M, L, U                                         */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* U is described over the very same CSR arrays, as the upper triangle with a non-unit diagonal */
  fillMode = CUSPARSE_FILL_MODE_UPPER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U,m,m,nz,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->csrVal,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_32I,
                    CUSPARSE_INDEX_BASE_ZERO,
                    cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_FILL_MODE,
                    &fillMode,
                    sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U,
                    CUSPARSE_SPMAT_DIAG_TYPE,
                    &diagType,
                    sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csrilu0, SpSV and allocate buffers                 */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
  PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_L,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_L,
                    &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    &PETSC_CUSPARSE_ONE,
                    fs->spMatDescr_U,
                    fs->dnVecDescr_X,
                    fs->dnVecDescr_Y,
                    cusparse_scalartype,
                    CUSPARSE_SPSV_ALG_DEFAULT,
                    fs->spsvDescr_U,
                    &fs->spsvBufferSize_U));

  /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab,
     and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77,
     spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U.
     To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U.
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_U,fs->spsvBufferSize_U));
  } else {
    PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_U,(size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_U = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ilu0 on M, SpSv on L and U                             */
  /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                    fs->matDescr_M,
                    fs->csrVal,
                    fs->csrRowPtr,
                    fs->csrColIdx,
                    fs->ilu0Info_M,
                    fs->policy_M,
                    fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* Function cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csrilu02: A(%d,%d) is missing",structural_zero,structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
    PetscInt       *ai,*adiag,nzRow,nzLeft; /* A's Mat_SeqAIJ i/diag arrays; named differently from the const int *Ai above to avoid shadowing it */
    PetscLogDouble flops = 0.0;

    PetscCall(MatMarkDiagonal_SeqAIJ(A));
    ai    = Aseq->i;
    adiag = Aseq->diag;
    for (PetscInt i=0; i<m; i++) {
      if (ai[i] < adiag[i] && adiag[i] < ai[i+1]) { /* There are nonzeros left to the diagonal of row i */
        nzRow  = ai[i+1] - ai[i];
        nzLeft = adiag[i] - ai[i]; /* actual number of nonzeros left of the diagonal in row i */
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
          Eliminating the j-th one (j = 0..nzLeft-1) then costs 2*(nzRow-j) flops, which sums to the
          expression below.
        */
        flops += nzLeft*(2.0*nzRow-nzLeft+1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0;
  PetscFunctionReturn(0);
}
2041da112707SJunchao Zhang 
/*
  MatSolve_SeqAIJCUSPARSE_ICC0 - Solves with the ICC(0) factor held in fact, reading b and writing x.

  Two cuSPARSE SpSV solves are chained on the single matrix descriptor fs->spMatDescr_L:
  a non-transposed solve from b into the work array fs->Y, followed by a transposed
  solve from fs->Y into x. The dense-vector descriptors are re-pointed before each solve.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact,Vec b,Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ*)fact->data;
  const PetscScalar            *rhs;
  PetscScalar                  *sol;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayWrite(x,&sol));
  PetscCall(VecCUDAGetArrayRead(b,&rhs));
  PetscCall(PetscLogGpuTimeBegin());

  /* Forward solve L*y = b: descriptor X views b, descriptor Y views the work array fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,(void*)rhs));
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y,fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X,fs->dnVecDescr_Y,
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_L));

  /* Backward solve Lt*x = y: descriptor X now views x, descriptor Y still views fs->Y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X,sol));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                       fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y,fs->dnVecDescr_X,
                                       cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b,&rhs));
  PetscCall(VecCUDARestoreArrayWrite(x,&sol));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*factAij->nz - fact->rmap->n));
  PetscFunctionReturn(0);
}
2086da112707SJunchao Zhang 
/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric phase of the device-side ICC(0) factorization.

  Copies the current values of A (device CSR) into fact's device value array, runs
  cusparseXcsric02() in place on it, then performs the SpSV analysis for both the
  non-transposed and the transposed triangular solve. On exit the solve callbacks of
  fact point at MatSolve_SeqAIJCUSPARSE_ICC0 and the estimated factorization flops
  stored in fs->numericFactFlops are logged.

  The info argument is not referenced by this routine.
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs      = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
  Mat_SeqAIJ                   *factAij = (Mat_SeqAIJ*)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix                    *Acsr;
  const PetscInt               m        = fact->rmap->n;
  const PetscInt               nz       = factAij->nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscBool isCusparse;

    PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isCusparse));
    PetscCheck(isCusparse,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
  }

  /* Bring A up to date on the device, then copy its values into fact's value array */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix*)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal,Acsr->values->data().get(),sizeof(PetscScalar)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));

  /* In-place factorization of fact.
     https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     Function csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) {
    PetscCallCUSPARSE(cusparseXcsric02(fs->handle,m,nz,fs->matDescr_M,
                                       fs->csrVal,fs->csrRowPtr,fs->csrColIdx,
                                       fs->ic0Info_M,fs->policy_M,fs->factBuffer_M));
  }
  if (PetscDefined(USE_DEBUG)) {
    int              zeroPivotRow;
    cusparseStatus_t pivotStatus = cusparseXcsric02_zeroPivot(fs->handle,fs->ic0Info_M,&zeroPivotRow);

    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != pivotStatus,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Numerical zero pivot detected in csric02: A(%d,%d) is zero",zeroPivotRow,zeroPivotRow);
  }

  /* Analysis for the non-transposed solve with L */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y,
                                          cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_L,fs->spsvBuffer_L));

  /* Analysis for the transposed solve with L.
     Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
     ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
  */
  PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle,CUSPARSE_OPERATION_TRANSPOSE,&PETSC_CUSPARSE_ONE,
                                          fs->spMatDescr_L,fs->dnVecDescr_X,fs->dnVecDescr_Y,
                                          cusparse_scalartype,CUSPARSE_SPSV_ALG_DEFAULT,
                                          fs->spsvDescr_Lt,fs->spsvBuffer_Lt));

  /* The factor now lives on the device only; install the solve callbacks */
  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(0);
}
2164da112707SJunchao Zhang 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic phase of the device-side ICC(0) factorization.

  Steps performed, in order:
    1. (debug builds only) check that A is MATSEQAIJCUSPARSE, square, and has no missing diagonal entry;
    2. reset the tri-factor structure and duplicate A's metadata into fact without copying values;
    3. allocate device CSR arrays for fact and copy A's row pointers and column indices into them;
    4. create the cuSPARSE descriptors for the csric02 matrix M and the SpSV matrix L;
    5. query buffer sizes for csric02 and both SpSV solves, and allocate the buffers;
    6. run the csric02 analysis;
    7. estimate the flops of the numeric phase and install the numeric-factorization callback.

  Arguments:
    fact - factor matrix being set up; fact->spptr is used as a Mat_SeqAIJCUSPARSETriFactors
    A    - matrix to be factored
    perm - not referenced in this routine
    info - only info->fill is read here (stored as fact->info.fill_ratio_given)
*/
2165da112707SJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2166da112707SJunchao Zhang {
2167da112707SJunchao Zhang   Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors*)fact->spptr;
2168da112707SJunchao Zhang   Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)fact->data;
2169da112707SJunchao Zhang   PetscInt                     m,nz;
2170da112707SJunchao Zhang 
2171da112707SJunchao Zhang   PetscFunctionBegin;
  /* Input validation; this whole block is only active when PETSc is built with debugging */
2172da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
2173da112707SJunchao Zhang     PetscInt  i;
2174da112707SJunchao Zhang     PetscBool flg,missing;
2175da112707SJunchao Zhang 
2176da112707SJunchao Zhang     PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2177da112707SJunchao Zhang     PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Expected MATSEQAIJCUSPARSE, but input is %s",((PetscObject)A)->type_name);
2178da112707SJunchao Zhang     PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
2179da112707SJunchao Zhang     PetscCall(MatMissingDiagonal(A,&missing,&i));
2180da112707SJunchao Zhang     PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,i);
2181da112707SJunchao Zhang   }
2182da112707SJunchao Zhang 
2183da112707SJunchao Zhang   /* Free the old stale stuff */
2184da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));
2185da112707SJunchao Zhang 
2186da112707SJunchao Zhang   /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
2187da112707SJunchao Zhang      but they will not be used. Allocate them just for easy debugging.
2188da112707SJunchao Zhang    */
2189da112707SJunchao Zhang   PetscCall(MatDuplicateNoCreate_SeqAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE/*malloc*/));
2190da112707SJunchao Zhang 
2191da112707SJunchao Zhang   fact->offloadmask            = PETSC_OFFLOAD_BOTH;
2192da112707SJunchao Zhang   fact->factortype             = MAT_FACTOR_ICC;
2193da112707SJunchao Zhang   fact->info.factor_mallocs    = 0;
2194da112707SJunchao Zhang   fact->info.fill_ratio_given  = info->fill;
2195da112707SJunchao Zhang   fact->info.fill_ratio_needed = 1.0;
2196da112707SJunchao Zhang 
2197da112707SJunchao Zhang   aij->row = NULL;
2198da112707SJunchao Zhang   aij->col = NULL;
2199da112707SJunchao Zhang 
2200da112707SJunchao Zhang   /* ====================================================================== */
2201da112707SJunchao Zhang   /* Copy A's i, j to fact and also allocate the value array of fact.       */
2202da112707SJunchao Zhang   /* We'll do in-place factorization on fact                                */
2203da112707SJunchao Zhang   /* ====================================================================== */
  /* Ai, Aj are used below as sources of device-to-device copies */
2204da112707SJunchao Zhang   const int *Ai,*Aj;
2205da112707SJunchao Zhang 
2206da112707SJunchao Zhang   m  = fact->rmap->n;
2207da112707SJunchao Zhang   nz = aij->nz;
2208da112707SJunchao Zhang 
2209da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void**)&fs->csrRowPtr,sizeof(int)*(m+1)));
2210da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void**)&fs->csrColIdx,sizeof(int)*nz));
2211da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void**)&fs->csrVal,sizeof(PetscScalar)*nz));
2212da112707SJunchao Zhang   PetscCall(MatSeqAIJCUSPARSEGetIJ(A,PETSC_FALSE,&Ai,&Aj));  /* Do not use compressed Ai */
2213da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr,Ai,sizeof(int)*(m+1),cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2214da112707SJunchao Zhang   PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx,Aj,sizeof(int)*nz,cudaMemcpyDeviceToDevice,PetscDefaultCudaStream));
2215da112707SJunchao Zhang 
2216da112707SJunchao Zhang   /* ====================================================================== */
2217da112707SJunchao Zhang   /* Create mat descriptors for M, L                                        */
2218da112707SJunchao Zhang   /* ====================================================================== */
2219da112707SJunchao Zhang   cusparseFillMode_t fillMode;
2220da112707SJunchao Zhang   cusparseDiagType_t diagType;
2221da112707SJunchao Zhang 
2222da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
2223da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
2224da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));
2225da112707SJunchao Zhang 
2226da112707SJunchao Zhang   /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
2227da112707SJunchao Zhang     cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
2228da112707SJunchao Zhang     assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
2229da112707SJunchao Zhang     all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
2230da112707SJunchao Zhang     assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
2231da112707SJunchao Zhang   */
  /* The SpSV descriptor for L is built on the same three device arrays that csric02 factors in place */
2232da112707SJunchao Zhang   fillMode = CUSPARSE_FILL_MODE_LOWER;
2233da112707SJunchao Zhang   diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
2234da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L,m,m,nz,
2235da112707SJunchao Zhang                     fs->csrRowPtr,
2236da112707SJunchao Zhang                     fs->csrColIdx,
2237da112707SJunchao Zhang                     fs->csrVal,
2238da112707SJunchao Zhang                     CUSPARSE_INDEX_32I,
2239da112707SJunchao Zhang                     CUSPARSE_INDEX_32I,
2240da112707SJunchao Zhang                     CUSPARSE_INDEX_BASE_ZERO,
2241da112707SJunchao Zhang                     cusparse_scalartype));
2242da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
2243da112707SJunchao Zhang                     CUSPARSE_SPMAT_FILL_MODE,
2244da112707SJunchao Zhang                     &fillMode,
2245da112707SJunchao Zhang                     sizeof(fillMode)));
2246da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L,
2247da112707SJunchao Zhang                     CUSPARSE_SPMAT_DIAG_TYPE,
2248da112707SJunchao Zhang                     &diagType,
2249da112707SJunchao Zhang                     sizeof(diagType)));
2250da112707SJunchao Zhang 
2251da112707SJunchao Zhang   /* ========================================================================= */
2252da112707SJunchao Zhang   /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
2253da112707SJunchao Zhang   /* ========================================================================= */
2254da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  /* The csric02 calls in this routine are all skipped for an empty (m == 0) matrix */
2255da112707SJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz,
2256da112707SJunchao Zhang                     fs->matDescr_M,
2257da112707SJunchao Zhang                     fs->csrVal,
2258da112707SJunchao Zhang                     fs->csrRowPtr,
2259da112707SJunchao Zhang                     fs->csrColIdx,
2260da112707SJunchao Zhang                     fs->ic0Info_M,
2261da112707SJunchao Zhang                     &fs->factBufferSize_M));
2262da112707SJunchao Zhang 
  /* Device work vectors of length m, wrapped by the dense-vector descriptors X and Y */
2263da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void**)&fs->X,sizeof(PetscScalar)*m));
2264da112707SJunchao Zhang   PetscCallCUDA(cudaMalloc((void**)&fs->Y,sizeof(PetscScalar)*m));
2265da112707SJunchao Zhang 
2266da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X,m,fs->X,cusparse_scalartype));
2267da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y,m,fs->Y,cusparse_scalartype));
2268da112707SJunchao Zhang 
2269da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
2270da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
2271da112707SJunchao Zhang                     CUSPARSE_OPERATION_NON_TRANSPOSE,
2272da112707SJunchao Zhang                     &PETSC_CUSPARSE_ONE,
2273da112707SJunchao Zhang                     fs->spMatDescr_L,
2274da112707SJunchao Zhang                     fs->dnVecDescr_X,
2275da112707SJunchao Zhang                     fs->dnVecDescr_Y,
2276da112707SJunchao Zhang                     cusparse_scalartype,
2277da112707SJunchao Zhang                     CUSPARSE_SPSV_ALG_DEFAULT,
2278da112707SJunchao Zhang                     fs->spsvDescr_L,
2279da112707SJunchao Zhang                     &fs->spsvBufferSize_L));
2280da112707SJunchao Zhang 
2281da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
2282da112707SJunchao Zhang   PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle,
2283da112707SJunchao Zhang                     CUSPARSE_OPERATION_TRANSPOSE,
2284da112707SJunchao Zhang                     &PETSC_CUSPARSE_ONE,
2285da112707SJunchao Zhang                     fs->spMatDescr_L,
2286da112707SJunchao Zhang                     fs->dnVecDescr_X,
2287da112707SJunchao Zhang                     fs->dnVecDescr_Y,
2288da112707SJunchao Zhang                     cusparse_scalartype,
2289da112707SJunchao Zhang                     CUSPARSE_SPSV_ALG_DEFAULT,
2290da112707SJunchao Zhang                     fs->spsvDescr_Lt,
2291da112707SJunchao Zhang                     &fs->spsvBufferSize_Lt));
2292da112707SJunchao Zhang 
229312ba2bc6SJunchao Zhang   /* To save device memory, we make the factorization buffer share with one of the solver buffer.
229412ba2bc6SJunchao Zhang      See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
229512ba2bc6SJunchao Zhang    */
  /* The shared allocation is sized to the max of the factorization buffer and the larger SpSV buffer;
     the other SpSV buffer gets its own allocation. */
229612ba2bc6SJunchao Zhang   if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
229712ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_L,(size_t)fs->factBufferSize_M)));
229812ba2bc6SJunchao Zhang     fs->spsvBuffer_L = fs->factBuffer_M;
2299da112707SJunchao Zhang     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_Lt,fs->spsvBufferSize_Lt));
230012ba2bc6SJunchao Zhang   } else {
230112ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void**)&fs->factBuffer_M,PetscMax(fs->spsvBufferSize_Lt,(size_t)fs->factBufferSize_M)));
230212ba2bc6SJunchao Zhang     fs->spsvBuffer_Lt = fs->factBuffer_M;
230312ba2bc6SJunchao Zhang     PetscCallCUDA(cudaMalloc((void**)&fs->spsvBuffer_L,fs->spsvBufferSize_L));
230412ba2bc6SJunchao Zhang   }
2305da112707SJunchao Zhang 
2306da112707SJunchao Zhang   /* ========================================================================== */
2307da112707SJunchao Zhang   /* Perform analysis of ic0 on M                                               */
2308da112707SJunchao Zhang   /* The lower triangular part of M has the same sparsity pattern as L          */
2309da112707SJunchao Zhang   /* ========================================================================== */
2310da112707SJunchao Zhang   int              structural_zero;
2311da112707SJunchao Zhang   cusparseStatus_t status;
2312da112707SJunchao Zhang 
2313da112707SJunchao Zhang   fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
2314da112707SJunchao Zhang   if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz,
2315da112707SJunchao Zhang                     fs->matDescr_M,
2316da112707SJunchao Zhang                     fs->csrVal,
2317da112707SJunchao Zhang                     fs->csrRowPtr,
2318da112707SJunchao Zhang                     fs->csrColIdx,
2319da112707SJunchao Zhang                     fs->ic0Info_M,
2320da112707SJunchao Zhang                     fs->policy_M,
2321da112707SJunchao Zhang                     fs->factBuffer_M));
2322da112707SJunchao Zhang   if (PetscDefined(USE_DEBUG)) {
2323da112707SJunchao Zhang     /* Function cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
2324da112707SJunchao Zhang     status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
2325da112707SJunchao Zhang     PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status,PETSC_COMM_SELF,PETSC_ERR_USER_INPUT,"Structural zero pivot detected in csric02: A(%d,%d) is missing",structural_zero,structural_zero);
2326da112707SJunchao Zhang   }
2327da112707SJunchao Zhang 
2328da112707SJunchao Zhang   /* Estimate FLOPs of the numeric factorization */
23290dd8c0acSJunchao Zhang   {
    /* NOTE: this Ai is A's row-pointer array read on the host (Aseq->i); it shadows the
       'const int *Ai' declared earlier in this function, which is used for device copies. */
2330da112707SJunchao Zhang     Mat_SeqAIJ     *Aseq = (Mat_SeqAIJ*)A->data;
23310dd8c0acSJunchao Zhang     PetscInt       *Ai,nzRow,nzLeft;
2332da112707SJunchao Zhang     PetscLogDouble flops = 0.0;
2333da112707SJunchao Zhang 
2334da112707SJunchao Zhang     Ai = Aseq->i;
2335da112707SJunchao Zhang     for (PetscInt i=0; i<m; i++) {
2336da112707SJunchao Zhang       nzRow = Ai[i+1] - Ai[i];
2337da112707SJunchao Zhang       if (nzRow > 1) {
2338da112707SJunchao Zhang         /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
2339da112707SJunchao Zhang           and include the eliminated one will be updated, which incurs a multiplication and an addition.
2340da112707SJunchao Zhang         */
        /* The count left of the diagonal is taken as half of the off-diagonal entries of the row */
2341da112707SJunchao Zhang         nzLeft = (nzRow-1)/2;
2342da112707SJunchao Zhang         flops += nzLeft*(2.0*nzRow-nzLeft+1);
2343da112707SJunchao Zhang       }
2344da112707SJunchao Zhang     }
2345da112707SJunchao Zhang     fs->numericFactFlops = flops;
23460dd8c0acSJunchao Zhang   }
  /* Route the numeric Cholesky/ICC phase of fact to the device ICC(0) implementation */
2347da112707SJunchao Zhang   fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
2348da112707SJunchao Zhang   PetscFunctionReturn(0);
2349da112707SJunchao Zhang }
2350da112707SJunchao Zhang #endif
2351da112707SJunchao Zhang 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU dispatcher.

  With cuSPARSE >= 11.5, the device ILU(0) path is taken when factorization on the device
  was requested, info->levels is zero, and both isrow and iscol are identity orderings.
  Every other case resets the tri-factor data and uses the SeqAIJ symbolic ILU, with the
  numeric phase routed to MatLUFactorNumeric_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  {
    PetscBool rowIsIdentity = PETSC_FALSE,colIsIdentity = PETSC_FALSE;

    /* The orderings are only inspected when device factorization was requested */
    if (triFactors->factorizeOnDevice) {
      PetscCall(ISIdentity(isrow,&rowIsIdentity));
      PetscCall(ISIdentity(iscol,&colIsIdentity));
    }
    if (!info->levels && rowIsIdentity && colIsIdentity) {
      PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B,A,isrow,iscol,info));
      PetscFunctionReturn(0);
    }
  }
 #endif
  /* Fallback: SeqAIJ symbolic factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2374da112707SJunchao Zhang 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU: resets the tri-factor data held in B->spptr,
  delegates to the SeqAIJ symbolic LU, and routes the numeric phase to MatLUFactorNumeric_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors)); /* drop any stale factor data first */
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2385da112707SJunchao Zhang 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC dispatcher.

  With cuSPARSE >= 11.5, the device ICC(0) path is taken when factorization on the device
  was requested, info->levels is zero, and perm is an identity ordering. Every other case
  resets the tri-factor data and uses the SeqAIJ symbolic ICC, with the numeric phase
  routed to MatCholeskyFactorNumeric_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
 #if CUSPARSE_VERSION >= 11500
  {
    PetscBool permIsIdentity = PETSC_FALSE;

    /* The ordering is only inspected when device factorization was requested */
    if (triFactors->factorizeOnDevice) PetscCall(ISIdentity(perm,&permIsIdentity));
    if (!info->levels && permIsIdentity) {
      PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B,A,perm,info));
      PetscFunctionReturn(0);
    }
  }
 #endif
  /* Fallback: SeqAIJ symbolic factorization */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2405da112707SJunchao Zhang 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky: resets the tri-factor data held in
  B->spptr, delegates to the SeqAIJ symbolic Cholesky, and routes the numeric phase to
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors)); /* drop any stale factor data first */
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2416da112707SJunchao Zhang 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE through *type.
  The matrix argument A is not inspected.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
2423841d4cb1SJunchao Zhang 
2424841d4cb1SJunchao Zhang /*MC
2425841d4cb1SJunchao Zhang   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
2426841d4cb1SJunchao Zhang   of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
2427841d4cb1SJunchao Zhang   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
2428841d4cb1SJunchao Zhang   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
2429841d4cb1SJunchao Zhang   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
2430841d4cb1SJunchao Zhang   algorithms are not recommended. This class does NOT support direct solver operations.
2431841d4cb1SJunchao Zhang 
2432841d4cb1SJunchao Zhang   Level: beginner
2433841d4cb1SJunchao Zhang 
2434841d4cb1SJunchao Zhang .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
2435841d4cb1SJunchao Zhang M*/
2436841d4cb1SJunchao Zhang 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the factor matrix *B for A with the cuSPARSE solver.

  Input:
    A     - matrix to be factored; its local row count sets the size of *B
    ftype - requested factorization; LU, ILU, ILUDT, Cholesky and ICC are accepted
  Output:
    B     - new MATSEQAIJCUSPARSE matrix with factortype set and symbolic callbacks installed

  The option -mat_factor_bind_factorization <host|device> (default "device", compared
  case-insensitively) is read here and stored in the tri-factor structure as factorizeOnDevice.
  When A is bound to the CPU, the plain SeqAIJ symbolic routines are installed instead of the
  cuSPARSE ones.

  Errors:
    PETSC_ERR_SUP            - ftype is not one of the accepted factor types
    PETSC_ERR_ARG_OUTOFRANGE - the option value is neither "host" nor "device"
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscInt        n = A->rmap->n;
  PetscBool       factOnDevice,factOnHost;
  const PetscBool isLUFamily       = (PetscBool)(ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT);
  const PetscBool isCholeskyFamily = (PetscBool)(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC);
  char            *prefix;
  char            factPlace[32] = "device"; /* the default */

  PetscFunctionBegin;
  /* Reject unsupported factor types before anything is allocated, so the error path does not
     leave a partially configured matrix behind in *B. */
  PetscCheck(isLUFamily || isCholeskyFamily,PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  (*B)->factortype = ftype; /* set before MatSetType(), as in the original ordering */
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  /* Use the factor's own options prefix when it has one, otherwise fall back to A's */
  prefix = (*B)->factorprefix ? (*B)->factorprefix : ((PetscObject)A)->prefix;
  PetscOptionsBegin(PetscObjectComm((PetscObject)(*B)),prefix,"MatGetFactor","Mat");
  PetscCall(PetscOptionsString("-mat_factor_bind_factorization","Do matrix factorization on host or device when possible","MatGetFactor",NULL,factPlace,sizeof(factPlace),NULL));
  PetscOptionsEnd();
  PetscCall(PetscStrcasecmp("device",factPlace,&factOnDevice));
  PetscCall(PetscStrcasecmp("host",factPlace,&factOnHost));
  PetscCheck(factOnDevice || factOnHost,PetscObjectComm((PetscObject)(*B)),PETSC_ERR_ARG_OUTOFRANGE,"Wrong option %s to -mat_factor_bind_factorization <string>. Only host and device are allowed",factPlace);
  ((Mat_SeqAIJCUSPARSETriFactors*)(*B)->spptr)->factorizeOnDevice = factOnDevice;

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (isLUFamily) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else { /* Cholesky family; guaranteed by the check at the top */
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  }

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
2489841d4cb1SJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the nonzero values of A from the device to the host array a->a.

  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_GPU. Only the values are transferred
  (a->nz scalars); the index arrays are not touched here. On success the offload mask becomes
  PETSC_OFFLOAD_BOTH.

  For an unfactored matrix the values come from the CsrMatrix held in the Mat_SeqAIJCUSPARSE struct.
  For a factored matrix the values come from the csrVal buffer of the Mat_SeqAIJCUSPARSETriFactors
  struct, when that code path is compiled in and the buffer exists; otherwise PETSC_ERR_SUP is raised.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  /* The host copy is already usable unless the device holds the only valid values */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
  if (A->factortype == MAT_FACTOR_NONE) {
    /* A->spptr is interpreted as Mat_SeqAIJCUSPARSE for unfactored matrices */
    Mat_SeqAIJCUSPARSE *cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
    CsrMatrix          *devcsr = (CsrMatrix*)cusp->mat->mat;

    PetscCallCUDA(cudaMemcpy(aij->a, devcsr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  }
 #if CUSPARSE_VERSION >= 13500
  /* A->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors for factored matrices */
  else if (((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->csrVal) {
    /* We have a factorized matrix on device and are able to copy it to host */
    PetscCallCUDA(cudaMemcpy(aij->a, ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->csrVal, aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  }
 #endif
  else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for copying this type of factorized matrix from device to host");
  PetscCall(PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar)));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
25187e8381f9SStefano Zampini 
/*
  Read/write access to the host value array: the values are first synchronized from the device
  (if the device holds the only valid copy), then the host pointer is returned in *array.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
252667a45760SJunchao Zhang 
/*
  Ends read/write access to the host value array: the caller's pointer is cleared and the matrix is
  marked as valid on the host only (PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
253467a45760SJunchao Zhang 
/*
  Read-only access to the host value array: the values are synchronized from the device first,
  then the host pointer is returned in *array.
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
254267a45760SJunchao Zhang 
/*
  Ends read-only access to the host value array. Only the caller's pointer is cleared; the offload
  mask of A is left as it is.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
254967a45760SJunchao Zhang 
/*
  Write-only access to the host value array: the host pointer is returned in *array without
  synchronizing values from the device beforehand.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
255667a45760SJunchao Zhang 
/*
  Ends write-only access to the host value array: the caller's pointer is cleared and the matrix is
  marked as valid on the host only (PETSC_OFFLOAD_CPU).
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
25647e8381f9SStefano Zampini 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - Returns the device CSR arrays of an unfactored matrix.

  Input Parameter:
. A - the matrix; must be unfactored and stored in the CSR format

  Output Parameters (each may be NULL if not wanted):
+ i     - device pointer to the row offsets
. j     - device pointer to the column indices
. a     - device pointer to the values
- mtype - set to PETSC_MEMTYPE_CUDA

  Notes:
  The matrix is brought up to date on the device before the pointers are handed out.

  The state checks are done BEFORE MatSeqAIJCUSPARSECopyToGPU() is called, because that routine
  dereferences A->spptr as a Mat_SeqAIJCUSPARSE right away: validating afterwards would make the
  NULL check unreachable in the failing case and would misinterpret the spptr of a factored matrix.

  With the ELL/HYB formats the mult struct holds a cusparseHybMat_t instead of a CsrMatrix, so the
  format is checked before the cast.

  Requesting i or j when PETSc is built with 64-bit indices raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  /* Validate the state first: the copy routine below already relies on it */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Only the CSR storage format is supported");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
25957ee59b9bSJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of A current.

  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  It is an error to call this on a matrix that is bound to the CPU.

  Two paths:
  - values only: when the nonzero state recorded on the device side matches A->nonzerostate and the
    format is CSR, only the values are re-uploaded and the cached transpose is invalidated;
  - full rebuild: otherwise the device mult struct, the cached transpose, the work vector and the
    device row offsets are destroyed and everything is rebuilt from the host CSR data
    (or from the compressed-row data when a->compressedrow.use is set).

  The offload mask becomes PETSC_OFFLOAD_BOTH, except when the host has no value array (a->a is NULL)
  during a full rebuild; in that case the mask is left unchanged.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mstruct;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ*)A->data;
  PetscInt                     nrows,*rowptr,*rowidx,ncprow,nnz;
  cusparseStatus_t             stat;
  PetscBool                    hostHasValues = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  /* Only host-only or never-allocated data needs to be pushed to the device */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  if (A->nonzerostate == cusp->nonzerostate && cusp->format == MAT_CUSPARSE_CSR) {
    /* Copy values only */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    PetscCheck(!aij->nz || aij->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
    csr->values->assign(aij->a, aij->a+aij->nz);
    PetscCallCUDA(WaitForCUDA());
    PetscCall(PetscLogCpuToGpu((aij->nz)*sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
    PetscFunctionReturn(0);
  }

  /* Full rebuild: throw away everything that depends on the old nonzero structure */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  cusp->workVector     = NULL;
  cusp->rowoffsets_gpu = NULL;
  try {
    const PetscBool cprow = aij->compressedrow.use;

    /* Pick the row description: compressed rows (with their row index map) or the plain CSR rows */
    nrows  = cprow ? aij->compressedrow.nrows  : A->rmap->n;
    rowptr = cprow ? aij->compressedrow.i      : aij->i;
    rowidx = cprow ? aij->compressedrow.rindex : NULL;

    PetscCheck(rowptr,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
    if (aij->a) nnz = aij->nz;
    else {
      /* No host values: take the count from the row offsets and remember that host and device are not both valid */
      nnz           = rowptr[nrows];
      hostHasValues = PETSC_FALSE;
    }
    PetscCheck(!nnz || aij->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

    /* create cusparse matrix */
    cusp->nrows = nrows;
    mstruct     = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&mstruct->descr));
    PetscCallCUSPARSE(cusparseSetMatIndexBase(mstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
    PetscCallCUSPARSE(cusparseSetMatType(mstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* device-resident scalar constants; the handle is switched to device pointer mode below */
    PetscCallCUDA(cudaMalloc((void **)&(mstruct->alpha_one),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(mstruct->beta_zero),sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&(mstruct->beta_one), sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(mstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCallCUSPARSE(cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE));

    /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
    if (cusp->format==MAT_CUSPARSE_CSR) {
      /* set the matrix */
      CsrMatrix *csr = new CsrMatrix;

      csr->num_rows    = nrows;
      csr->num_cols    = A->cmap->n;
      csr->num_entries = nnz;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(rowptr, rowptr + nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnz);
      csr->column_indices->assign(aij->j, aij->j+nnz);

      csr->values = new THRUSTARRAY(nnz);
      if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

      /* assign the pointer */
      mstruct->mat = csr;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (csr->num_rows) { /* cusparse errors on empty matrices! */
        stat = cusparseCreateCsr(&mstruct->matDescr,
                                csr->num_rows, csr->num_cols, csr->num_entries,
                                csr->row_offsets->data().get(), csr->column_indices->data().get(),
                                csr->values->data().get(),
                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
      }
     #endif
    } else if (cusp->format==MAT_CUSPARSE_ELL || cusp->format==MAT_CUSPARSE_HYB) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
     #else
      /* temporary CSR copy on the device, converted to HYB/ELL and then released */
      CsrMatrix *csr = new CsrMatrix;

      csr->num_rows    = nrows;
      csr->num_cols    = A->cmap->n;
      csr->num_entries = nnz;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(rowptr, rowptr + nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnz);
      csr->column_indices->assign(aij->j, aij->j+nnz);

      csr->values = new THRUSTARRAY(nnz);
      if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = CUSPARSE_HYB_PARTITION_AUTO;
      if (cusp->format==MAT_CUSPARSE_ELL) partition = CUSPARSE_HYB_PARTITION_MAX;
      stat = cusparse_csr2hyb(cusp->handle, csr->num_rows, csr->num_cols,
          mstruct->descr, csr->values->data().get(),
          csr->row_offsets->data().get(),
          csr->column_indices->data().get(),
          hybMat, 0, partition);PetscCallCUSPARSE(stat);
      /* assign the pointer */
      mstruct->mat = hybMat;

      /* the CSR staging copy is no longer needed (delete on a null pointer is a no-op) */
      delete (THRUSTARRAY*)csr->values;
      delete (THRUSTINTARRAY32*)csr->column_indices;
      delete (THRUSTINTARRAY32*)csr->row_offsets;
      delete (CsrMatrix*)csr;
     #endif
    }

    /* assign the compressed row indices */
    if (cprow) {
      cusp->workVector      = new THRUSTARRAY(nrows);
      mstruct->cprowIndices = new THRUSTINTARRAY(nrows);
      mstruct->cprowIndices->assign(rowidx,rowidx+nrows);
      ncprow = nrows;
    } else {
      cusp->workVector      = NULL;
      mstruct->cprowIndices = NULL;
      ncprow = 0;
    }
    PetscCall(PetscLogCpuToGpu(((nrows+1)+(aij->nz))*sizeof(int)+ncprow*sizeof(PetscInt)+(3+(aij->nz))*sizeof(PetscScalar)));

    /* assign the pointer */
    cusp->mat = mstruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
  cusp->nonzerostate = A->nonzerostate;
  if (hostHasValues) A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
27509ae82921SPaul Mullowney 
/* Functor for tuple-valued iteration: adds element 0 of the tuple into element 1 (elem1 = elem1 + elem0) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
2760aa372e3fSPaul Mullowney 
/* Functor for tuple-valued iteration: copies element 0 of the tuple into element 1 (elem1 = elem0) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
27707e8381f9SStefano Zampini 
/* Functor for tuple-valued iteration: copies element 1 of the tuple into element 0 (elem0 = elem1) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
2780e6e9a74fSStefano Zampini 
/*
  Per-product context attached to a MatProduct involving a SeqAIJCUSPARSE matrix.
  It is released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar    *Bt;      /* device buffer, released with cudaFree() */
  Mat            X;        /* auxiliary matrix, released with MatDestroy(); receives the sparse-times-dense result for RARt/PtAP products */
  PetscBool      reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;    /* released with a plain delete of the struct */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda;   /* Record leading dimensions of B and C here to detect changes*/
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;   /* device buffer, released with cudaFree() */
  void                  *dBuffer5;   /* device buffer, released with cudaFree() */
 #endif
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                  *mmBuffer;    /* device work buffer passed to cusparseSpMM() */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2805ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product context.

  Input Parameter:
. data - pointer to the MatMatCusparse struct; the struct itself is freed with PetscFree()

  Device buffers are released with cudaFree(), cuSPARSE descriptors with their matching destroy
  routine (only when set), and the auxiliary matrix X with MatDestroy().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *ctx = (MatMatCusparse *)data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(ctx->Bt));
  delete ctx->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors first, then the work buffers */
  if (ctx->matSpBDescr) {
    PetscCallCUSPARSE(cusparseDestroySpMat(ctx->matSpBDescr));
  }
  if (ctx->matBDescr) {
    PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matBDescr));
  }
  if (ctx->matCDescr) {
    PetscCallCUSPARSE(cusparseDestroyDnMat(ctx->matCDescr));
  }
  if (ctx->spgemmDesc) {
    PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(ctx->spgemmDesc));
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (ctx->dBuffer4) {
    PetscCallCUDA(cudaFree(ctx->dBuffer4));
  }
  if (ctx->dBuffer5) {
    PetscCallCUDA(cudaFree(ctx->dBuffer5));
  }
 #endif
  if (ctx->mmBuffer) {
    PetscCallCUDA(cudaFree(ctx->mmBuffer));
  }
  if (ctx->mmBuffer2) {
    PetscCallCUDA(cudaFree(ctx->mmBuffer2));
  }
 #endif
  PetscCall(MatDestroy(&ctx->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2829ccdfe979SStefano Zampini 
2830ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2831ccdfe979SStefano Zampini 
2832ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2833ccdfe979SStefano Zampini {
2834ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2835ccdfe979SStefano Zampini   Mat                          A,B;
2836afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2837ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2838ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2839ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2840ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2841ccdfe979SStefano Zampini   const PetscScalar            *barray;
2842ccdfe979SStefano Zampini   PetscScalar                  *carray;
2843ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2844ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2845ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2846ccdfe979SStefano Zampini 
2847ccdfe979SStefano Zampini   PetscFunctionBegin;
2848ccdfe979SStefano Zampini   MatCheckProduct(C,1);
284928b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2850ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2851ccdfe979SStefano Zampini   A    = product->A;
2852ccdfe979SStefano Zampini   B    = product->B;
28539566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
285428b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2855ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2856ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
285728b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
28589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2859ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2860ccdfe979SStefano Zampini   switch (product->type) {
2861ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2862ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2863ccdfe979SStefano Zampini     mat = cusp->mat;
2864ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2865ccdfe979SStefano Zampini     m   = A->rmap->n;
2866ccdfe979SStefano Zampini     n   = B->cmap->n;
2867ccdfe979SStefano Zampini     break;
2868ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
28691a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2870e6e9a74fSStefano Zampini       mat = cusp->mat;
2871e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2872e6e9a74fSStefano Zampini     } else {
28739566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2874ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2875ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2876e6e9a74fSStefano Zampini     }
2877ccdfe979SStefano Zampini     m = A->cmap->n;
2878ccdfe979SStefano Zampini     n = B->cmap->n;
2879ccdfe979SStefano Zampini     break;
2880ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2881ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2882ccdfe979SStefano Zampini     mat = cusp->mat;
2883ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2884ccdfe979SStefano Zampini     m   = A->rmap->n;
2885ccdfe979SStefano Zampini     n   = B->rmap->n;
2886ccdfe979SStefano Zampini     break;
2887ccdfe979SStefano Zampini   default:
288898921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2889ccdfe979SStefano Zampini   }
289028b400f6SJacob Faibussowitsch   PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2891ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2892ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
28939566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
28949566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
28959566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDAGetArrayRead(B,&barray));
2896afb2bd1cSJunchao Zhang 
28979566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B,&blda));
2898c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
28999566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
29009566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X,&clda));
2901c8378d12SStefano Zampini   } else {
29029566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
29039566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C,&clda));
2904c8378d12SStefano Zampini   }
2905c8378d12SStefano Zampini 
29069566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2907afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2908afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2909a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2910afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2911fcdce8c4SStefano Zampini     size_t mmBufferSize;
29129566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
2913afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
29149566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2915afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2916afb2bd1cSJunchao Zhang     }
2917c8378d12SStefano Zampini 
29189566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
2919afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
29209566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2921afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2922afb2bd1cSJunchao Zhang     }
2923afb2bd1cSJunchao Zhang 
2924afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2925afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2926afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2927afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2928afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2929afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
29309566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2931afb2bd1cSJunchao Zhang     }
2932afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2933afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2934afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
29359566063dSJacob Faibussowitsch                                    cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
2936fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
29379566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
29389566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
2939fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2940fcdce8c4SStefano Zampini     }
2941afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2942afb2bd1cSJunchao Zhang   } else {
2943afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
29449566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
29459566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
29469566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
2947afb2bd1cSJunchao Zhang   }
2948afb2bd1cSJunchao Zhang 
2949afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2950afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2951afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2952afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
29539566063dSJacob Faibussowitsch                       cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2954afb2bd1cSJunchao Zhang  #else
2955afb2bd1cSJunchao Zhang   PetscInt k;
2956afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2957ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2958ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2959ccdfe979SStefano Zampini     cublasStatus_t cerr;
2960ccdfe979SStefano Zampini 
29619566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2962ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2963ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2964ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2965ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
29669566063dSJacob Faibussowitsch                        mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
2967ccdfe979SStefano Zampini     blda = B->cmap->n;
2968afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2969afb2bd1cSJunchao Zhang   } else {
2970afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2971ccdfe979SStefano Zampini   }
2972ccdfe979SStefano Zampini 
2973afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2974ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2975afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2976ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2977ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2978ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2979ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
29809566063dSJacob Faibussowitsch                            carray,clda);PetscCallCUSPARSE(stat);
2981afb2bd1cSJunchao Zhang  #endif
29829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
29839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
29849566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
2985ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
29869566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
29879566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
2988ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
29899566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
29909566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
2991ccdfe979SStefano Zampini   } else {
29929566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
2993ccdfe979SStefano Zampini   }
2994ccdfe979SStefano Zampini   if (mmdata->cisdense) {
29959566063dSJacob Faibussowitsch     PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
2996ccdfe979SStefano Zampini   }
2997ccdfe979SStefano Zampini   if (!biscuda) {
29989566063dSJacob Faibussowitsch     PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
2999ccdfe979SStefano Zampini   }
3000ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3001ccdfe979SStefano Zampini }
3002ccdfe979SStefano Zampini 
3003ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
3004ccdfe979SStefano Zampini {
3005ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
3006ccdfe979SStefano Zampini   Mat                A,B;
3007ccdfe979SStefano Zampini   PetscInt           m,n;
3008ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
3009ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
3010ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
3011ccdfe979SStefano Zampini 
3012ccdfe979SStefano Zampini   PetscFunctionBegin;
3013ccdfe979SStefano Zampini   MatCheckProduct(C,1);
301428b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
3015ccdfe979SStefano Zampini   A    = product->A;
3016ccdfe979SStefano Zampini   B    = product->B;
30179566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
301828b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
3019ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
302008401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3021ccdfe979SStefano Zampini   switch (product->type) {
3022ccdfe979SStefano Zampini   case MATPRODUCT_AB:
3023ccdfe979SStefano Zampini     m = A->rmap->n;
3024ccdfe979SStefano Zampini     n = B->cmap->n;
3025ccdfe979SStefano Zampini     break;
3026ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
3027ccdfe979SStefano Zampini     m = A->cmap->n;
3028ccdfe979SStefano Zampini     n = B->cmap->n;
3029ccdfe979SStefano Zampini     break;
3030ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
3031ccdfe979SStefano Zampini     m = A->rmap->n;
3032ccdfe979SStefano Zampini     n = B->rmap->n;
3033ccdfe979SStefano Zampini     break;
3034ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
3035ccdfe979SStefano Zampini     m = B->cmap->n;
3036ccdfe979SStefano Zampini     n = B->cmap->n;
3037ccdfe979SStefano Zampini     break;
3038ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
3039ccdfe979SStefano Zampini     m = B->rmap->n;
3040ccdfe979SStefano Zampini     n = B->rmap->n;
3041ccdfe979SStefano Zampini     break;
3042ccdfe979SStefano Zampini   default:
304398921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3044ccdfe979SStefano Zampini   }
30459566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
3046ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
30479566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
30489566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQDENSECUDA));
3049ccdfe979SStefano Zampini 
3050ccdfe979SStefano Zampini   /* product data */
30519566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3052ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
3053afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
3054afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
3055ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
30569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
3057ccdfe979SStefano Zampini   }
3058afb2bd1cSJunchao Zhang  #endif
3059ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
3060ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
30619566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
30629566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
3063ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
30649566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
3065ccdfe979SStefano Zampini     } else {
30669566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
3067ccdfe979SStefano Zampini     }
3068ccdfe979SStefano Zampini   }
3069ccdfe979SStefano Zampini   C->product->data    = mmdata;
3070ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3071ccdfe979SStefano Zampini 
3072ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
3073ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3074ccdfe979SStefano Zampini }
3075ccdfe979SStefano Zampini 
3076fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3077ccdfe979SStefano Zampini {
3078ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
3079fcdce8c4SStefano Zampini   Mat                          A,B;
3080fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3081fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
3082fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3083fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3084fcdce8c4SStefano Zampini   PetscBool                    flg;
3085fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
3086fcdce8c4SStefano Zampini   MatProductType               ptype;
3087fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3088fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3089fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
3090fcdce8c4SStefano Zampini #endif
3091b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3092ccdfe979SStefano Zampini 
3093ccdfe979SStefano Zampini   PetscFunctionBegin;
3094ccdfe979SStefano Zampini   MatCheckProduct(C,1);
309528b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
30969566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
309728b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
3098fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
3099fcdce8c4SStefano Zampini   A = product->A;
3100fcdce8c4SStefano Zampini   B = product->B;
3101fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
3102fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
3103fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
310408401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3105fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
310628b400f6SJacob Faibussowitsch     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
3107fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
310828b400f6SJacob Faibussowitsch     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
3109fcdce8c4SStefano Zampini     goto finalize;
3110fcdce8c4SStefano Zampini   }
3111fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
31129566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
311328b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
31149566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
311528b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
311628b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
311728b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
3118fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3119fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
3120fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
312108401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
312208401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
312308401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
31249566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
31259566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3126fcdce8c4SStefano Zampini 
3127fcdce8c4SStefano Zampini   ptype = product->type;
3128fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
3129fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
313028b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
3131fa046f9fSJunchao Zhang   }
3132fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
3133fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
313428b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
3135fa046f9fSJunchao Zhang   }
3136fcdce8c4SStefano Zampini   switch (ptype) {
3137fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3138fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3139fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3140fcdce8c4SStefano Zampini     break;
3141fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3142fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3143fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3144fcdce8c4SStefano Zampini     break;
3145fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3146fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3147fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3148fcdce8c4SStefano Zampini     break;
3149fcdce8c4SStefano Zampini   default:
315098921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3151fcdce8c4SStefano Zampini   }
3152fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
315328b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
315428b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
315528b400f6SJacob Faibussowitsch   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
3156fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
3157fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
3158fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
315928b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
316028b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
316128b400f6SJacob Faibussowitsch   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
31629566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3163fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3164fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
31659566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3166b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3167b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3168b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3169b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
31709566063dSJacob Faibussowitsch                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3171b4285af6SJunchao Zhang   #else
3172b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3173fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3174fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
31759566063dSJacob Faibussowitsch                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3176b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3177fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
31789566063dSJacob Faibussowitsch                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3179b4285af6SJunchao Zhang   #endif
3180fcdce8c4SStefano Zampini #else
3181b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3182fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3183fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3184fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
31859566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3186fcdce8c4SStefano Zampini #endif
31879566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
31889566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
31899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3190fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
3191fcdce8c4SStefano Zampini finalize:
3192fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
31939566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
31949566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
31959566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
3196fcdce8c4SStefano Zampini   c->reallocs         = 0;
3197fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
3198fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
3199fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
3200fcdce8c4SStefano Zampini   C->num_ass++;
3201ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3202ccdfe979SStefano Zampini }
3203fcdce8c4SStefano Zampini 
3204fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
3205fcdce8c4SStefano Zampini {
3206fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
3207fcdce8c4SStefano Zampini   Mat                          A,B;
3208fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
3209fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
3210fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
3211fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3212fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
3213fcdce8c4SStefano Zampini   PetscBool                    flg;
3214fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
3215fcdce8c4SStefano Zampini   MatProductType               ptype;
3216fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
3217fcdce8c4SStefano Zampini   PetscLogDouble               flops;
3218fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
3219fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3220fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
3221fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
3222fcdce8c4SStefano Zampini #else
3223fcdce8c4SStefano Zampini   int                          cnz;
3224fcdce8c4SStefano Zampini #endif
3225b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
3226fcdce8c4SStefano Zampini 
3227fcdce8c4SStefano Zampini   PetscFunctionBegin;
3228fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
322928b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
3230fcdce8c4SStefano Zampini   A    = product->A;
3231fcdce8c4SStefano Zampini   B    = product->B;
32329566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
323328b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
32349566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
323528b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
3236fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
3237fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
3238fcdce8c4SStefano Zampini   /* product data */
32399566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
3240fcdce8c4SStefano Zampini   C->product->data    = mmdata;
3241fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
3242fcdce8c4SStefano Zampini 
32439566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
32449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
3245d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
3246d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
324708401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
324808401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
3249d60bce21SJunchao Zhang 
3250fcdce8c4SStefano Zampini   ptype = product->type;
3251fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
3252fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
3253fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
3254fa046f9fSJunchao Zhang   }
3255fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
3256fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
3257fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
3258fa046f9fSJunchao Zhang   }
3259fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
3260fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
3261fcdce8c4SStefano Zampini   switch (ptype) {
3262fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
3263fcdce8c4SStefano Zampini     m = A->rmap->n;
3264fcdce8c4SStefano Zampini     n = B->cmap->n;
3265fcdce8c4SStefano Zampini     k = A->cmap->n;
3266fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3267fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3268fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3269fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3270fcdce8c4SStefano Zampini     break;
3271fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
3272fcdce8c4SStefano Zampini     m = A->cmap->n;
3273fcdce8c4SStefano Zampini     n = B->cmap->n;
3274fcdce8c4SStefano Zampini     k = A->rmap->n;
32759566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3276fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
3277fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
3278fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
3279fcdce8c4SStefano Zampini     break;
3280fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
3281fcdce8c4SStefano Zampini     m = A->rmap->n;
3282fcdce8c4SStefano Zampini     n = B->rmap->n;
3283fcdce8c4SStefano Zampini     k = A->cmap->n;
32849566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
3285fcdce8c4SStefano Zampini     Amat = Acusp->mat;
3286fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
3287fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
3288fcdce8c4SStefano Zampini     break;
3289fcdce8c4SStefano Zampini   default:
329098921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
3291fcdce8c4SStefano Zampini   }
3292fcdce8c4SStefano Zampini 
3293fcdce8c4SStefano Zampini   /* create cusparse matrix */
32949566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
32959566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
3296fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
3297fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
3298fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3299fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
3300fcdce8c4SStefano Zampini 
3301fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
3302fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
3303fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
33049566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
33059566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
3306fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
3307fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
3308fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
3309fcdce8c4SStefano Zampini   } else {
3310fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
3311fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
3312fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
3313fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
3314fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
3315fcdce8c4SStefano Zampini   }
3316fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
3317fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
3318fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
3319fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
3320fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
3321fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
33229566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
33239566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
33249566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
33259566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
33269566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
33279566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
33289566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
33299566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
33309566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
3331fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
3332fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
3333fcdce8c4SStefano Zampini     c->nz = 0;
3334fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3335fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
3336fcdce8c4SStefano Zampini     goto finalizesym;
3337fcdce8c4SStefano Zampini   }
3338fcdce8c4SStefano Zampini 
333928b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
334028b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
3341fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
3342fcdce8c4SStefano Zampini   if (!biscompressed) {
3343fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
3344fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3345fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
3346fcdce8c4SStefano Zampini #endif
3347fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
3348fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
3349fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
3350fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
3351fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
3352fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
3353fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
3354fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
3355fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
3356fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3357fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
33589566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
3359fcdce8c4SStefano Zampini     }
3360fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
3361fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
3362fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3363fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
3364fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
3365fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
3366fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
3367fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
33689566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
3369fcdce8c4SStefano Zampini     }
3370fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
3371fcdce8c4SStefano Zampini #endif
3372fcdce8c4SStefano Zampini   }
337328b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
337428b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
3375fcdce8c4SStefano Zampini   /* precompute flops count */
3376fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
3377fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
3378fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
3379fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
3380fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
3381fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
3382fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
3383fcdce8c4SStefano Zampini       }
3384fcdce8c4SStefano Zampini     }
3385fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
3386fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
3387fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
3388fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
3389fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
3390fcdce8c4SStefano Zampini     }
3391fcdce8c4SStefano Zampini   } else { /* TODO */
3392fcdce8c4SStefano Zampini     flops = 0.;
3393fcdce8c4SStefano Zampini   }
3394fcdce8c4SStefano Zampini 
3395fcdce8c4SStefano Zampini   mmdata->flops = flops;
33969566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
3397b4285af6SJunchao Zhang 
3398fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
33999566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3400fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
3401fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
3402fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
34039566063dSJacob Faibussowitsch                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
34049566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
3405b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3406b4285af6SJunchao Zhang  {
3407b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
3408b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
3409b4285af6SJunchao Zhang   */
3410b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
3411b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
3412b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
3413b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
3414b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
3415b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
3416b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
3417b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
3418b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
3419b4285af6SJunchao Zhang 
3420b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
3421b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
3422b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3423b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
34249566063dSJacob Faibussowitsch                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
34259566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
3426b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
3427b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3428b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
34299566063dSJacob Faibussowitsch                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
3430b4285af6SJunchao Zhang 
3431b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
3432b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3433b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
34349566063dSJacob Faibussowitsch                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
34359566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
34369566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
34379566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
3438b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3439b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
34409566063dSJacob Faibussowitsch                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
34419566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer1));
34429566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer2));
3443b4285af6SJunchao Zhang 
3444b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
3445b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
34469566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3447b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
3448b4285af6SJunchao Zhang   /* allocate matrix C */
34499566063dSJacob Faibussowitsch   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
34509566063dSJacob Faibussowitsch   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3451b4285af6SJunchao Zhang   /* update matC with the new pointers */
3452b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
34539566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3454b4285af6SJunchao Zhang 
3455b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
3456b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3457b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
34589566063dSJacob Faibussowitsch                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
34599566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
3460b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
3461b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
34629566063dSJacob Faibussowitsch                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
34639566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer3));
3464b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
3465b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3466b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
34679566063dSJacob Faibussowitsch                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
34689566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
3469b4285af6SJunchao Zhang  }
3470ae37ee31SJunchao Zhang  #else
3471b4285af6SJunchao Zhang   size_t bufSize2;
3472fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
3473b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3474fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3475fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
34769566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
34779566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
3478fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
3479b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
3480fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3481fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
34829566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
3483fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
3484b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3485fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3486fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
34879566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
3488fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
3489fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
3490fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
3491fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
3492fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
34939566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
3494fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
3495b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
3496fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
3497fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
34989566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
3499fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
35009566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
3501fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
35029566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
3503fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
35049566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3505fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
35069566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3507fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
35089566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
3509b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
3510fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
35119566063dSJacob Faibussowitsch                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
3512ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3513fcdce8c4SStefano Zampini #else
35149566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
3515b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
3516fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3517fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3518fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
35199566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
3520fcdce8c4SStefano Zampini   c->nz = cnz;
3521fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
35229566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3523fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
35249566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
3525fcdce8c4SStefano Zampini 
35269566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
3527fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
3528fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
3529fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
3530b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
3531fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
3532fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
3533fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
35349566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
3535fcdce8c4SStefano Zampini #endif
35369566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
35379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
3538fcdce8c4SStefano Zampini finalizesym:
3539fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
3540fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
3541fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
35429566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m+1,&c->i));
35439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->j));
3544fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
3545fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3546fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
3547fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
3548fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
3549fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
3550fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
35519566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
35529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3553fcdce8c4SStefano Zampini   } else {
3554fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
3555fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
35569566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
35579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
3558fcdce8c4SStefano Zampini   }
3559fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
3560fcdce8c4SStefano Zampini     PetscInt r = 0;
3561fcdce8c4SStefano Zampini     c->i[0] = 0;
3562fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
3563fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
3564fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
3565fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
3566fcdce8c4SStefano Zampini     }
3567fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
3568fcdce8c4SStefano Zampini   }
35699566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
35709566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->ilen));
35719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->imax));
3572fcdce8c4SStefano Zampini   c->maxnz = c->nz;
3573fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
3574fcdce8c4SStefano Zampini   c->rmax = 0;
3575fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
3576fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
3577fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
3578fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
3579fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
3580fcdce8c4SStefano Zampini   }
35819566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
35829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->a));
3583fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
3584fcdce8c4SStefano Zampini 
3585fcdce8c4SStefano Zampini   C->nonzerostate++;
35869566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
35879566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
3588fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
3589fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
3590fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
3591fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
3592fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
3593abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
3594fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
3595fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
3596fcdce8c4SStefano Zampini   }
3597fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
3598fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
3599fcdce8c4SStefano Zampini }
3600fcdce8c4SStefano Zampini 
3601fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
3602fcdce8c4SStefano Zampini 
/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - chooses the symbolic product routine stored in
  mat->ops->productsymbolic for the product described by mat->product. B may be sparse or dense.

  Input Parameter:
. mat - the product matrix; MatCheckProduct() is applied to it before anything else is done

  Notes:
  Three outcomes are possible:
    - B has base type MATSEQDENSE: the CUSPARSE/dense symbolic routine is installed, unless A is
      bound to the CPU, in which case MatProductSetFromOptions_SeqAIJ_SeqDense() decides.
    - B (and, for ABC products, C) is MATSEQAIJCUSPARSE and nothing is bound to the CPU: the
      CUSPARSE sparse-sparse symbolic routine (AB, AtB, ABt) or MatProductSymbolic_ABC_Basic
      (PtAP, RARt, ABC) is installed. A "..._backend_cpu" option can veto this path.
    - otherwise: MatProductSetFromOptions_SeqAIJ() decides.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));

  /* B is only considered a CUSPARSE operand when neither A nor B is bound to the CPU */
  if (!(product->A->boundtocpu || product->B->boundtocpu)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }

  /* C is inspected only for ABC products; for every other type Ciscusp keeps its PETSC_TRUE default */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }

  if (Biscusp && Ciscusp) { /* the CPU backend can always be requested instead */
    /* apiname doubles as the options title (user-API case) and as the manual page of the option;
       apiopt is the option name in the user-API case; prodtitle is the title otherwise */
    const char *apiname = NULL,*apiopt = NULL,*prodtitle = NULL;
    PetscBool  usecpu = PETSC_FALSE;

    switch (product->type) {
    case MATPRODUCT_AB:
      apiname   = "MatMatMult";
      apiopt    = "-matmatmult_backend_cpu";
      prodtitle = "MatProduct_AB";
      break;
    case MATPRODUCT_AtB:
      apiname   = "MatTransposeMatMult";
      apiopt    = "-mattransposematmult_backend_cpu";
      prodtitle = "MatProduct_AtB";
      break;
    case MATPRODUCT_PtAP:
      apiname   = "MatPtAP";
      apiopt    = "-matptap_backend_cpu";
      prodtitle = "MatProduct_PtAP";
      break;
    case MATPRODUCT_RARt:
      apiname   = "MatRARt";
      apiopt    = "-matrart_backend_cpu";
      prodtitle = "MatProduct_RARt";
      break;
    case MATPRODUCT_ABC:
      apiname   = "MatMatMatMult";
      apiopt    = "-matmatmatmult_backend_cpu";
      prodtitle = "MatProduct_ABC";
      break;
    default: /* no backend option is queried for the remaining product types */
      break;
    }
    if (apiname) {
      const char *title   = product->api_user ? apiname : prodtitle;
      const char *optname = product->api_user ? apiopt : "-mat_product_algorithm_backend_cpu";

      PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");
      PetscCall(PetscOptionsBool(optname,"Use CPU code",apiname,usecpu,&usecpu,NULL));
      PetscOptionsEnd();
    }
    /* a CPU request disables the CUSPARSE sparse-sparse path taken below */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      }
      break;
    default:
      break;
    }
  } else if (!Biscusp || !Ciscusp) { /* fallback for AIJ */
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
  } else {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  }
  PetscFunctionReturn(0);
}
3724ccdfe979SStefano Zampini 
37256fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
37269ae82921SPaul Mullowney {
37279ae82921SPaul Mullowney   PetscFunctionBegin;
37289566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
3729e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3730e6e9a74fSStefano Zampini }
3731e6e9a74fSStefano Zampini 
/* MatMultAdd for SeqAIJCUSPARSE: zz = A*xx + yy. Delegates to the shared mult-add kernel without transposition. */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian));
  PetscFunctionReturn(0);
}
3738e6e9a74fSStefano Zampini 
/* MatMultHermitianTranspose for SeqAIJCUSPARSE: yy = A^H*xx. Requests both the transpose and the Hermitian flag from the shared kernel. */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian));
  PetscFunctionReturn(0);
}
3745e6e9a74fSStefano Zampini 
/* MatMultHermitianTransposeAdd for SeqAIJCUSPARSE: zz = A^H*xx + yy, computed by the shared mult-add kernel. */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian));
  PetscFunctionReturn(0);
}
37529ae82921SPaul Mullowney 
37536fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3754ca45077fSPaul Mullowney {
3755ca45077fSPaul Mullowney   PetscFunctionBegin;
37569566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
3757ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3758ca45077fSPaul Mullowney }
3759ca45077fSPaul Mullowney 
/*
  ScatterAdd - CUDA kernel computing y[idx[i]] += x[i] for 0 <= i < n.

  Each thread handles the single entry i given by its global 1D thread index; threads with i >= n do nothing,
  so any 1D launch that provides at least n threads is sufficient.

  The update is a plain (non-atomic) read-modify-write.
  NOTE(review): this presumably relies on idx[] holding no repeated entries - confirm against the caller.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Compute the index in PetscInt: blockIdx.x*blockDim.x alone is evaluated in 32-bit unsigned arithmetic, and storing
     it in an 'int' could wrap for large n when PetscInt is 64-bit */
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
3765a0e72f99SJunchao Zhang 
3766afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
3767e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
37689ae82921SPaul Mullowney {
37699ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
3770aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
37719ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3772e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
3773e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3774e6e9a74fSStefano Zampini   PetscBool                    compressed;
3775afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3776afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
3777afb2bd1cSJunchao Zhang #endif
37786e111a19SKarl Rupp 
37799ae82921SPaul Mullowney   PetscFunctionBegin;
378008401ef6SPierre Jolivet   PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
3781cbc6b225SStefano Zampini   if (!a->nz) {
37829566063dSJacob Faibussowitsch     if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
37839566063dSJacob Faibussowitsch     else PetscCall(VecCopy_SeqCUDA(yy,zz));
3784e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
3785e6e9a74fSStefano Zampini   }
378634d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
37879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
3788e6e9a74fSStefano Zampini   if (!trans) {
37899ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
37905f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3791e6e9a74fSStefano Zampini   } else {
37921a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3793e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3794e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3795e6e9a74fSStefano Zampini     } else {
37969566063dSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3797e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
3798e6e9a74fSStefano Zampini     }
3799e6e9a74fSStefano Zampini   }
3800e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3801e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3802213423ffSJunchao Zhang 
3803e6e9a74fSStefano Zampini   try {
38049566063dSJacob Faibussowitsch     PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
38059566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
38069566063dSJacob Faibussowitsch     else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */
3807afb2bd1cSJunchao Zhang 
38089566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
3809e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3810afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3811afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3812afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3813afb2bd1cSJunchao Zhang       */
3814e6e9a74fSStefano Zampini       xptr = xarray;
3815afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3816213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3817afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3818afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3819afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3820afb2bd1cSJunchao Zhang        */
3821afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3822afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3823afb2bd1cSJunchao Zhang         nx = mat->num_cols;
3824afb2bd1cSJunchao Zhang         ny = mat->num_rows;
3825afb2bd1cSJunchao Zhang       }
3826afb2bd1cSJunchao Zhang      #endif
3827e6e9a74fSStefano Zampini     } else {
3828afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3829afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3830afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3831afb2bd1cSJunchao Zhang        */
3832afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3833e6e9a74fSStefano Zampini       dptr = zarray;
3834e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3835afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3836e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3837a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3838e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3839e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
3840e6e9a74fSStefano Zampini       }
3841afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3842afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3843afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3844afb2bd1cSJunchao Zhang         nx = mat->num_rows;
3845afb2bd1cSJunchao Zhang         ny = mat->num_cols;
3846afb2bd1cSJunchao Zhang       }
3847afb2bd1cSJunchao Zhang      #endif
3848e6e9a74fSStefano Zampini     }
38499ae82921SPaul Mullowney 
3850afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3851aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3852afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
38535f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3854afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
38559566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
38569566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
38579566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3858afb2bd1cSJunchao Zhang                                                matstruct->matDescr,
3859afb2bd1cSJunchao Zhang                                                matstruct->cuSpMV[opA].vecXDescr, beta,
3860afb2bd1cSJunchao Zhang                                                matstruct->cuSpMV[opA].vecYDescr,
3861afb2bd1cSJunchao Zhang                                                cusparse_scalartype,
3862afb2bd1cSJunchao Zhang                                                cusparsestruct->spmvAlg,
38635f80ce2aSJacob Faibussowitsch                                                &matstruct->cuSpMV[opA].spmvBufferSize));
38649566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));
3865afb2bd1cSJunchao Zhang 
3866afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3867afb2bd1cSJunchao Zhang       } else {
3868afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
38699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
38709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
3871afb2bd1cSJunchao Zhang       }
3872afb2bd1cSJunchao Zhang 
38739566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
3874afb2bd1cSJunchao Zhang                                   matstruct->alpha_one,
38753606e59fSJunchao Zhang                                   matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3876afb2bd1cSJunchao Zhang                                   matstruct->cuSpMV[opA].vecXDescr,
3877afb2bd1cSJunchao Zhang                                   beta,
3878afb2bd1cSJunchao Zhang                                   matstruct->cuSpMV[opA].vecYDescr,
3879afb2bd1cSJunchao Zhang                                   cusparse_scalartype,
3880afb2bd1cSJunchao Zhang                                   cusparsestruct->spmvAlg,
38815f80ce2aSJacob Faibussowitsch                                   matstruct->cuSpMV[opA].spmvBuffer));
3882afb2bd1cSJunchao Zhang      #else
38837656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
38849566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
3885a65300a6SPaul Mullowney                                        mat->num_rows, mat->num_cols,
3886afb2bd1cSJunchao Zhang                                        mat->num_entries, matstruct->alpha_one, matstruct->descr,
3887aa372e3fSPaul Mullowney                                        mat->values->data().get(), mat->row_offsets->data().get(),
3888e6e9a74fSStefano Zampini                                        mat->column_indices->data().get(), xptr, beta,
38895f80ce2aSJacob Faibussowitsch                                        dptr));
3890afb2bd1cSJunchao Zhang      #endif
3891aa372e3fSPaul Mullowney     } else {
3892213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3893afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3894afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3895afb2bd1cSJunchao Zhang        #else
3896301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
38979566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
3898afb2bd1cSJunchao Zhang                                          matstruct->alpha_one, matstruct->descr, hybMat,
3899e6e9a74fSStefano Zampini                                          xptr, beta,
39005f80ce2aSJacob Faibussowitsch                                          dptr));
3901afb2bd1cSJunchao Zhang        #endif
3902a65300a6SPaul Mullowney       }
3903aa372e3fSPaul Mullowney     }
39049566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3905aa372e3fSPaul Mullowney 
3906e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3907213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3908213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
39099566063dSJacob Faibussowitsch           PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
3910e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
39119566063dSJacob Faibussowitsch           PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
39127656d835SStefano Zampini         }
3913213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
39149566063dSJacob Faibussowitsch         PetscCall(VecSet_SeqCUDA(zz,0));
39157656d835SStefano Zampini       }
39167656d835SStefano Zampini 
3917213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3918213423ffSJunchao Zhang       if (compressed) {
39199566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
3920a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3921a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3922a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3923a0e72f99SJunchao Zhang          */
3924a0e72f99SJunchao Zhang        #if 0
3925a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3926a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3927a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3928e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3929c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3930a0e72f99SJunchao Zhang        #else
3931a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3932a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3933a0e72f99SJunchao Zhang        #endif
39349566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
3935e6e9a74fSStefano Zampini       }
3936e6e9a74fSStefano Zampini     } else {
3937e6e9a74fSStefano Zampini       if (yy && yy != zz) {
39389566063dSJacob Faibussowitsch         PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3939e6e9a74fSStefano Zampini       }
3940e6e9a74fSStefano Zampini     }
39419566063dSJacob Faibussowitsch     PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
39429566063dSJacob Faibussowitsch     if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
39439566063dSJacob Faibussowitsch     else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
39449ae82921SPaul Mullowney   } catch(char *ex) {
394598921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
39469ae82921SPaul Mullowney   }
3947e6e9a74fSStefano Zampini   if (yy) {
39489566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0*a->nz));
3949e6e9a74fSStefano Zampini   } else {
39509566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
3951e6e9a74fSStefano Zampini   }
39529ae82921SPaul Mullowney   PetscFunctionReturn(0);
39539ae82921SPaul Mullowney }
39549ae82921SPaul Mullowney 
39556fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3956ca45077fSPaul Mullowney {
3957ca45077fSPaul Mullowney   PetscFunctionBegin;
39589566063dSJacob Faibussowitsch   PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
3959ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3960ca45077fSPaul Mullowney }
3961ca45077fSPaul Mullowney 
39626fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
39639ae82921SPaul Mullowney {
3964042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
3965042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
39663fa6b06aSMark Adams 
3967042217e8SBarry Smith   PetscFunctionBegin;
39689566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
3969042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
3970042217e8SBarry Smith 
39719566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
39729566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->deviceMat));
3973042217e8SBarry Smith     cusp->deviceMat = NULL;
3974042217e8SBarry Smith   }
39759ae82921SPaul Mullowney   PetscFunctionReturn(0);
39769ae82921SPaul Mullowney }
39779ae82921SPaul Mullowney 
39789ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATSEQAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  Mat newmat;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm,A));
  newmat = *A;
  /* local and global sizes are both m x n */
  PetscCall(MatSetSizes(newmat,m,n,m,n));
  PetscCall(MatSetType(newmat,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(newmat,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
40359ae82921SPaul Mullowney 
40366fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
40379ae82921SPaul Mullowney {
40389ae82921SPaul Mullowney   PetscFunctionBegin;
40399ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
40409566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
40419ae82921SPaul Mullowney   } else {
40429566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
4043aa372e3fSPaul Mullowney   }
40449566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
40459566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
40469566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
40479566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
40489566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
40499566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
40509566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
40519566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
40529566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
40539566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
40549566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
40559ae82921SPaul Mullowney   PetscFunctionReturn(0);
40569ae82921SPaul Mullowney }
40579ae82921SPaul Mullowney 
/* Forward declarations of routines defined elsewhere */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A into *B with the SeqAIJ routine, then converts *B in place
  to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  /* Step 1: SeqAIJ duplicate, honoring cpvalues */
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  /* Step 2: in-place conversion of the duplicate to the CUSPARSE type */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
40679ff858a8SKarl Rupp 
4068039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
406995639643SRichard Tran Mills {
4070a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
4071039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
4072039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
4073039c6fbaSStefano Zampini   PetscScalar        *ay;
4074039c6fbaSStefano Zampini   const PetscScalar  *ax;
4075039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
4076e6e9a74fSStefano Zampini 
407795639643SRichard Tran Mills   PetscFunctionBegin;
4078a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
4079a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
4080039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
40819566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
40829566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
4083a587d139SMark     PetscFunctionReturn(0);
408495639643SRichard Tran Mills   }
4085039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
40869566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
40879566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
40885f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
40895f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
4090039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
4091039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
4092039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
4093039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
4094039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
4095039c6fbaSStefano Zampini     if (eq) {
4096039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
4097039c6fbaSStefano Zampini     }
4098039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
4099039c6fbaSStefano Zampini   }
4100d2be01edSStefano Zampini   /* spgeam is buggy with one column */
4101d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
4102039c6fbaSStefano Zampini 
4103039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
4104039c6fbaSStefano Zampini     PetscScalar b = 1.0;
4105039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4106039c6fbaSStefano Zampini     size_t      bufferSize;
4107039c6fbaSStefano Zampini     void        *buffer;
4108039c6fbaSStefano Zampini #endif
4109039c6fbaSStefano Zampini 
41109566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
41119566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
41129566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
4113039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
41149566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
4115039c6fbaSStefano Zampini                                                   &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4116039c6fbaSStefano Zampini                                                   &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
41175f80ce2aSJacob Faibussowitsch                                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
41189566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
41199566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
41209566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
4121039c6fbaSStefano Zampini                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4122039c6fbaSStefano Zampini                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
41235f80ce2aSJacob Faibussowitsch                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
41249566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
41259566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
41269566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
4127039c6fbaSStefano Zampini #else
41289566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
41299566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
4130039c6fbaSStefano Zampini                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
4131039c6fbaSStefano Zampini                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
41325f80ce2aSJacob Faibussowitsch                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
41339566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
41349566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
4135039c6fbaSStefano Zampini #endif
41369566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
41379566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
41389566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
41399566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4140039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
4141a587d139SMark     cublasHandle_t cublasv2handle;
4142a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
4143039c6fbaSStefano Zampini 
41449566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
41459566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
41469566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
41479566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz,&bnz));
41489566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
41499566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
41509566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0*bnz));
41519566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
41529566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
41539566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
41549566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
4155039c6fbaSStefano Zampini   } else {
41569566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
41579566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
4158a587d139SMark   }
415995639643SRichard Tran Mills   PetscFunctionReturn(0);
416095639643SRichard Tran Mills }
416195639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Multiplies every stored value of Y by the scalar a.

  The value array is obtained through MatSeqAIJCUSPARSEGetArray(), scaled with a single
  cublasXscal() call over y->nz entries (unit stride), and handed back with
  MatSeqAIJCUSPARSERestoreArray(). The cached diagonal information of Y is invalidated afterwards.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij    = (Mat_SeqAIJ*)Y->data;
  PetscBLASInt   stride  = 1;
  PetscBLASInt   nvalues = 1;
  PetscScalar    *values;
  cublasHandle_t blas;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&values));
  PetscCall(PetscCUBLASGetHandle(&blas));
  /* cuBLAS takes a BLAS-sized integer; the cast errors out if nz does not fit */
  PetscCall(PetscBLASIntCast(aij->nz,&nvalues));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(blas,nvalues,&a,values,stride));
  PetscCall(PetscLogGpuFlops(nvalues));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&values));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
418133c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Zeroes the stored values of A.

  For an unfactored matrix the value arrays of the CUSPARSE matrix and of its cached transpose
  (when present) are filled with zeros via thrust. The host value array a->a is always zeroed.
  The offload mask becomes PETSC_OFFLOAD_BOTH when the (non-transposed) device values were
  zeroed as well, and PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij            = (Mat_SeqAIJ*)A->data;
  PetscBool  deviceAlsoZero  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
        deviceAlsoZero = PETSC_TRUE;
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      /* the transpose does not influence the offload mask */
      if (csrT->values) thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
    }
  }
  /* a->i[nrows] is the number of stored entries */
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = deviceAlsoZero ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
42103fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches A between the host SeqAIJ kernels and the CUSPARSE kernels.

  Input Parameters:
+  A   - the matrix
-  flg - PETSC_TRUE to install the SeqAIJ (host) operations, PETSC_FALSE to install the CUSPARSE ones

  Factored matrices only record the flag in A->boundtocpu. For unfactored matrices, binding to the
  CPU first copies the data back with MatSeqAIJCUSPARSECopyFromGPU(), clears the Mat_SeqAIJ
  operation table and removes the composed CUSPARSE functions; unbinding installs them again.
  Inodes are used only when bound to the CPU and a->inode.size is set.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    if (!flg) {
      /* Mat-level operations backed by CUSPARSE/cuBLAS */
      A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
      A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
      A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
      A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
      A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
      A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
      A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
      A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
      A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
      A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
      /* SeqAIJ-level array accessors */
      aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
      aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
      aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
      aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
      aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
      aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
      aij->ops->getcsrandmemtype        = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    } else {
      /* bring the values back before the host kernels take over */
      PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

      A->ops->scale                     = MatScale_SeqAIJ;
      A->ops->axpy                      = MatAXPY_SeqAIJ;
      A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
      A->ops->mult                      = MatMult_SeqAIJ;
      A->ops->multadd                   = MatMultAdd_SeqAIJ;
      A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
      A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
      A->ops->multhermitiantranspose    = NULL;
      A->ops->multhermitiantransposeadd = NULL;
      A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
      /* drop every SeqAIJ-level override at once */
      PetscCall(PetscMemzero(aij->ops,sizeof(Mat_SeqAIJOps)));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
      PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
    }
    A->boundtocpu   = flg;
    aij->inode.use  = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  } else {
    /* factored matrices: only remember the request */
    A->boundtocpu = flg;
  }
  PetscFunctionReturn(0);
}
4274a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE matrix.

  Input Parameters:
+  A     - the SeqAIJ matrix
.  mtype - requested type (not referenced in this routine)
-  reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
           existing *newmat, any other value converts *newmat as it is

  Output Parameter:
.  newmat - the converted matrix

  Unless reusing, a matrix without a CUSPARSE structure gets one (Mat_SeqAIJCUSPARSE for
  unfactored matrices, Mat_SeqAIJCUSPARSETriFactors for factors) with its own cuSPARSE handle
  bound to PetscDefaultCudaStream. The CUSPARSE operations and composed functions are installed last.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat C;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
    break;
  default: /* nothing to copy, *newmat is converted as it is */
    break;
  }
  C = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(C->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&C->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !C->spptr) {
    if (C->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle,PetscDefaultCudaStream));
      C->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle,PetscDefaultCudaStream));
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if CUSPARSE_VERSION > 11301
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      C->spptr = cusp;
    }
    C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  C->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  C->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;

  /* installs the binding-dependent CUSPARSE operations */
  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(C,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)C,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)C,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)C,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
43349ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE: builds B as a SeqAIJ matrix
  and then converts it in place to the CUSPARSE type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  Mat *inplace = &B; /* in-place conversion reads and writes the same handle */

  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(B));
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,inplace));
  PetscFunctionReturn(0);
}
434202fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: `MatCreateSeqAIJCUSPARSE()`, `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/
43607f756511SDominic Meiser 
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/*
  MatSolverTypeRegister_CUSPARSE - Registers the CUSPARSE factorization back ends:
  the band LU solver for MATSEQAIJ, and LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factorizations served by MatGetFactor_seqaijcusparse_cusparse, in registration order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const size_t        nftypes  = sizeof(ftypes)/sizeof(ftypes[0]);

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  for (size_t k=0; k<nftypes; k++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
437429b38603SBarry Smith 
/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - Discards the COO assembly data attached to mat.

  Deletes the cooPerm/cooPerm_a arrays and, when the extended COO path was in use, frees the
  jmap_d/perm_d device buffers. Does nothing if the matrix has no CUSPARSE structure.

  The freed device pointers are reset to NULL: MatSeqAIJCUSPARSE_Destroy() frees jmap_d/perm_d
  whenever they are non-NULL, so leaving them dangling here would make it free them a second time.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    cusp->jmap_d = NULL;
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4392cbc6b225SStefano Zampini 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases everything owned by a Mat_SeqAIJCUSPARSE structure and
  the structure itself: both mult structures, the work/offset/permutation arrays, the cuSPARSE
  handle and the jmap_d/perm_d device buffers. A NULL *cusparsestruct is accepted.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
  if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));
  PetscCall(PetscFree(*cusparsestruct));
  PetscFunctionReturn(0);
}
44117f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes the three arrays of a CsrMatrix and the CsrMatrix itself,
  then clears the caller's pointer. A NULL *mat is accepted.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
44247f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - (triangular-factor overload) Releases the cuSPARSE
  descriptor and solve info, the CSR matrix, the solve/csr2csc buffers and the AA_h host buffer
  of a Mat_SeqAIJCUSPARSETriFactorStruct, then frees the structure. A NULL *trifactor is accepted.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);
  if (tf->descr)         PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo)     PetscCallCUSPARSE(cusparseDestroyCsrsvInfo(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer)   PetscCallCUDA(cudaFree(tf->solveBuffer));
  if (tf->AA_h)          PetscCallCUDA(cudaFreeHost(tf->AA_h)); /* freed with cudaFreeHost, unlike the device buffers */
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
 #endif
  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(0);
}
44417f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - (mult-struct overload) Releases a Mat_SeqAIJCUSPARSEMultStruct.

  Input Parameters:
+  matstruct - address of the structure pointer; set to NULL on return
-  format    - storage format the structure was built with, selects how (*matstruct)->mat is freed

  ELL/HYB storage is destroyed through cusparseDestroyHybMat() and is rejected with PETSC_ERR_SUP
  when built against CUDA 11 or later; CSR storage is released with CsrMatrix_Destroy(). The matrix
  descriptor, the compressed-row indices, the alpha/beta device scalars and, for CUDA >= 11, the
  generic-API descriptors and SpMV buffers are released as well. A NULL *matstruct is accepted.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        CsrMatrix *mat = (CsrMatrix*)(*matstruct)->mat;

        /* propagate failures like every other call here instead of dropping the return code */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* one cuSpMV slot per entry; only initialized slots own a buffer and vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
44837f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases all factorization data held by a
  Mat_SeqAIJCUSPARSETriFactors structure while keeping the structure (and its handle) alive.

  Input Parameter:
.  trifactors - address of the structure pointer; a NULL structure is accepted

  Every buffer and descriptor released here is reset to NULL afterwards. The structure outlives
  this call and is reset again by MatSeqAIJCUSPARSETriFactors_Destroy(); stale pointers would be
  handed to cudaFree()/cusparseDestroy*() a second time.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector = NULL;
    if (fs->a_band_d)   PetscCallCUDA(cudaFree(fs->a_band_d));
    if (fs->i_band_d)   PetscCallCUDA(cudaFree(fs->i_band_d));
    fs->a_band_d = NULL;
    fs->i_band_d = NULL;
    fs->init_dev_prop = PETSC_FALSE;
   #if CUSPARSE_VERSION >= 11500
    PetscCallCUDA(cudaFree(fs->csrRowPtr));
    PetscCallCUDA(cudaFree(fs->csrColIdx));
    PetscCallCUDA(cudaFree(fs->csrVal));
    PetscCallCUDA(cudaFree(fs->X));
    PetscCallCUDA(cudaFree(fs->Y));
    // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */
    PetscCallCUDA(cudaFree(fs->spsvBuffer_L));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_U));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt));
    PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut));
    fs->csrRowPtr     = NULL;
    fs->csrColIdx     = NULL;
    fs->csrVal        = NULL;
    fs->X             = NULL;
    fs->Y             = NULL;
    fs->factBuffer_M  = NULL; /* aliased one of the spsv buffers freed above */
    fs->spsvBuffer_L  = NULL;
    fs->spsvBuffer_U  = NULL;
    fs->spsvBuffer_Lt = NULL;
    fs->spsvBuffer_Ut = NULL;
    PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L));
    PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U));
    PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X));
    PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y));
    PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M));
    PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M));
    fs->matDescr_M   = NULL;
    fs->spMatDescr_L = NULL;
    fs->spMatDescr_U = NULL;
    fs->spsvDescr_L  = NULL;
    fs->spsvDescr_Lt = NULL;
    fs->spsvDescr_U  = NULL;
    fs->spsvDescr_Ut = NULL;
    fs->dnVecDescr_X = NULL;
    fs->dnVecDescr_Y = NULL;
    fs->ilu0Info_M   = NULL;
    fs->ic0Info_M    = NULL;

    fs->createdTransposeSpSVDescr    = PETSC_FALSE;
    fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;
   #endif
  }
  PetscFunctionReturn(0);
}
4532ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - Resets the triangular-factor structure, destroys its
  cuSPARSE handle (if any) and frees the structure. A NULL *trifactors is accepted.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t cusparseHandle;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  /* the handle survives the reset and is released only here */
  cusparseHandle = (*trifactors)->handle;
  if (cusparseHandle) {
    PetscCallCUSPARSE(cusparseDestroy(cusparseHandle));
  }
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(0);
}
45477e8381f9SStefano Zampini 
/* Strict "less than" on index pairs: orders by the first entry, ties broken by the second */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt first1 = t1.get<0>(), first2 = t2.get<0>();

    if (first1 != first2) return first1 < first2;
    return t1.get<1>() < t2.get<1>();
  }
};
45587e8381f9SStefano Zampini 
/* Equality on index pairs: true only when both entries match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
45687e8381f9SStefano Zampini 
/* Change indicator: yields 1 when the two indices differ and 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
45777e8381f9SStefano Zampini 
/* Logical OR of two flags, returned as a PetscInt: 1 if either argument is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) const
  {
    return t1||t2;
  }
};
45867e8381f9SStefano Zampini 
45877e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - fills the device CSR values of A from the COO values v[]

  Input Parameters:
+ A     - the matrix; its cooPerm (and, with repeated entries, cooPerm_a) arrays are read here
. v     - the COO values, in host or device memory (decided by isCudaMem()); may be NULL
- imode - ADD_VALUES accumulates into the existing values, any other mode overwrites them

  Notes:
  If cusp->cooPerm is NULL the matrix is only assembled.
  If v is NULL, the values are zeroed for INSERT_VALUES and left untouched otherwise.
  Host values are staged in a temporary device array that is released before returning normally.
  On exit the matrix is marked assembled with its up-to-date data on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* stage the host values on the device */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
  }
  PetscCall(PetscLogGpuTimeBegin());
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add them up */
      /* automatic storage: released on every exit from this scope, including an exception thrown by thrust */
      THRUSTARRAY cooPerm_w(matrix->values->size());
      auto        vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
finalize:
  delete cooPerm_v; /* NULL unless host values were staged above */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
46697e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - clears A->transupdated so the stored transpose is no longer treated as current

  When destroy is true, the transpose mult struct and the csr2csc_i array are released as well.
  Nothing is done when A has no CUSPARSE struct attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusparsestruct) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format));
      delete cusparsestruct->csr2csc_i;
      cusparsestruct->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
4685a49f1ed0SStefano Zampini 
46867e8381f9SStefano Zampini #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - builds the CSR nonzero pattern of A from n COO pairs (coo_i[k],coo_j[k])

  The pairs are sorted by (row,col) and deduplicated on the device. The resulting row offsets and column
  indices are copied into the host Mat_SeqAIJ arrays, the host values are zeroed, and the matrix is then
  pushed to the GPU with MatSeqAIJCUSPARSECopyToGPU().

  cusp->cooPerm stores the sorting permutation. cusp->cooPerm_a is kept only when repeated pairs exist; it
  then maps every sorted entry to the index of its unique nonzero.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* arrays from a previous call have the wrong length: drop them */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
      d_j     = [2,2,3,5,6,x]
      nekey points at the x entries (one past the last unique pair); ekey is still the end of the arrays
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    /* thrust::get<0>() rather than the member get<0>(): the latter does not exist when thrust::tuple has no member get */
    thrust::upper_bound(d_i.begin(), thrust::get<0>(nekey.get_iterator_tuple()), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays with the pattern computed on the device */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count the rows that have at least one nonzero */
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4799ed502f03SStefano Zampini 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation entry point that picks between two code paths

  Any previous COO data is reset first. If coo_i[] is in host memory and a negative row or column index is
  found, the host routine MatSetPreallocationCOO_SeqAIJ() is used and its jmap/perm arrays are mirrored on
  the device. Otherwise (coo_i is NULL, lives on the device, or no negative index was found) the _Basic()
  variant is called.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscBool    has_negative = PETSC_FALSE;
  PetscMemType mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* stop at the first pair with a negative row or column index */
      PetscCount k = 0;
      while (k < coo_n && coo_i[k] >= 0 && coo_j[k] >= 0) k++;
      if (k < coo_n) has_negative = PETSC_TRUE;
    }
  }

  if (!has_negative) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
    PetscFunctionReturn(0);
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
  {
    Mat_SeqAIJ         *host = static_cast<Mat_SeqAIJ*>(mat->data);
    Mat_SeqAIJCUSPARSE *gpu  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);

    /* device copies of the maps produced by the host preallocation */
    PetscCallCUDA(cudaMalloc((void**)&gpu->jmap_d,(host->nz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(gpu->jmap_d,host->jmap,(host->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&gpu->perm_d,host->Atot*sizeof(PetscCount)));
    PetscCallCUDA(cudaMemcpy(gpu->perm_d,host->perm,host->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    gpu->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4835219fbbafSJunchao Zhang 
/*
  MatAddCOOValues - kernel that sets or accumulates the nnz matrix values a[] from the COO values kv[]

  For nonzero i, the entries perm[jmap[i]] .. perm[jmap[i+1]-1] index the values in kv[] that are summed
  into it. With INSERT_VALUES the previous a[i] is discarded, otherwise the sum is added to it.
  Written as a grid-stride loop, so any 1D launch configuration covers all nnz entries.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* widen before multiplying: blockIdx/blockDim/gridDim are 32-bit unsigned and their product can wrap */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x*blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4846219fbbafSJunchao Zhang 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of A from the COO values v[]

  When the matrix was preallocated through the extended COO path (dev->use_extended_coo), the values are
  combined on the device by the MatAddCOOValues kernel using the jmap_d/perm_d maps. A host v[] is first
  copied into a temporary device buffer, which is freed once the kernel has been launched and the array
  restored. Otherwise the call is forwarded to MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      PetscCallCUDA(cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
      /* account for the transfer, as the Basic variant does for its own staging copy */
      PetscCall(PetscLogCpuToGpu(seq->coo_n*sizeof(PetscScalar)));
    }

    /* write-only access is enough for INSERT_VALUES since the kernel then ignores the old values */
    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

    if (Annz) { /* a launch with zero blocks is invalid, so skip it for an empty matrix */
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
    else PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
  }
  PetscFunctionReturn(0);
}
4881219fbbafSJunchao Zhang 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (pass NULL if not needed)
-   j - the CSR column indices (pass NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      Only the CSR storage format is supported; an error is raised for the ELL and HYB formats.

      If both i and j are NULL the routine returns immediately without touching the matrix.

.seealso: `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing to do only when neither output is requested; a single requested array is still returned */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* built once from the host row offsets and then reused */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
49295f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE, as passed to MatSeqAIJCUSPARSEGetIJ(); not used by this routine

    Output Parameters:
+   i - the CSR row pointers; set to NULL on return if a non-NULL address is passed
-   j - the CSR column indices; set to NULL on return if a non-NULL address is passed

    Level: developer

.seealso: `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the caller's pointers are cleared */
  if (j) {
    *j = NULL;
  }
  if (i) {
    *i = NULL;
  }
  PetscFunctionReturn(0);
}
49565f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   Only the CSR storage format is supported; an error is raised for the ELL and HYB formats.

.seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(!(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB),PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy exists before handing out its pointer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  PetscFunctionReturn(0);
}
4991ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only the caller's pointer is cleared */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
5016ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   Only the CSR storage format is supported; an error is raised for the ELL and HYB formats.

   The matrix is marked as having its up-to-date data on the GPU and its transpose is invalidated (but not destroyed).

.seealso: `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csrmat;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheck(!(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB),PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy exists before handing out its pointer */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)cusparsestruct->mat->mat;
  PetscCheck(csrmat->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csrmat->values->data().get();
  /* the caller may modify the values through the returned pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
50535b7e41feSStefano Zampini /*@C
50545b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
5055039c6fbaSStefano Zampini 
50565b7e41feSStefano Zampini    Not Collective
50575b7e41feSStefano Zampini 
50585b7e41feSStefano Zampini    Input Parameter:
50595b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
50605b7e41feSStefano Zampini 
50615b7e41feSStefano Zampini    Output Parameter:
50625b7e41feSStefano Zampini .   a - pointer to the device data
50635b7e41feSStefano Zampini 
50645b7e41feSStefano Zampini    Level: developer
50655b7e41feSStefano Zampini 
5066db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`
50675b7e41feSStefano Zampini @*/
5068039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
5069039c6fbaSStefano Zampini {
5070039c6fbaSStefano Zampini   PetscFunctionBegin;
5071039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5072039c6fbaSStefano Zampini   PetscValidPointer(a,2);
5073039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
50749566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
50759566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
5076039c6fbaSStefano Zampini   *a = NULL;
5077039c6fbaSStefano Zampini   PetscFunctionReturn(0);
5078039c6fbaSStefano Zampini }
5079039c6fbaSStefano Zampini 
50805b7e41feSStefano Zampini /*@C
50815b7e41feSStefano Zampini    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
50825b7e41feSStefano Zampini 
50835b7e41feSStefano Zampini    Not Collective
50845b7e41feSStefano Zampini 
50855b7e41feSStefano Zampini    Input Parameter:
50865b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
50875b7e41feSStefano Zampini 
50885b7e41feSStefano Zampini    Output Parameter:
50895b7e41feSStefano Zampini .   a - pointer to the device data
50905b7e41feSStefano Zampini 
50915b7e41feSStefano Zampini    Level: developer
50925b7e41feSStefano Zampini 
50935b7e41feSStefano Zampini    Notes: does not trigger host-device copies and flags data validity on the GPU
50945b7e41feSStefano Zampini 
5095db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
50965b7e41feSStefano Zampini @*/
5097ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
5098ed502f03SStefano Zampini {
5099ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
5100ed502f03SStefano Zampini   CsrMatrix          *csr;
5101ed502f03SStefano Zampini 
5102ed502f03SStefano Zampini   PetscFunctionBegin;
5103ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5104ed502f03SStefano Zampini   PetscValidPointer(a,2);
5105ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5106aed4548fSBarry Smith   PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
510728b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5108ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
510928b400f6SJacob Faibussowitsch   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
5110ed502f03SStefano Zampini   *a = csr->values->data().get();
5111039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
51129566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
5113ed502f03SStefano Zampini   PetscFunctionReturn(0);
5114ed502f03SStefano Zampini }
5115ed502f03SStefano Zampini 
51165b7e41feSStefano Zampini /*@C
51175b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
51185b7e41feSStefano Zampini 
51195b7e41feSStefano Zampini    Not Collective
51205b7e41feSStefano Zampini 
51215b7e41feSStefano Zampini    Input Parameter:
51225b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
51235b7e41feSStefano Zampini 
51245b7e41feSStefano Zampini    Output Parameter:
51255b7e41feSStefano Zampini .   a - pointer to the device data
51265b7e41feSStefano Zampini 
51275b7e41feSStefano Zampini    Level: developer
51285b7e41feSStefano Zampini 
5129db781477SPatrick Sanan .seealso: `MatSeqAIJCUSPARSEGetArrayWrite()`
51305b7e41feSStefano Zampini @*/
5131ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
5132ed502f03SStefano Zampini {
5133ed502f03SStefano Zampini   PetscFunctionBegin;
5134ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5135ed502f03SStefano Zampini   PetscValidPointer(a,2);
5136ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
51379566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
51389566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
5139ed502f03SStefano Zampini   *a = NULL;
5140ed502f03SStefano Zampini   PetscFunctionReturn(0);
5141ed502f03SStefano Zampini }
5142ed502f03SStefano Zampini 
5143ed502f03SStefano Zampini struct IJCompare4
5144ed502f03SStefano Zampini {
5145ed502f03SStefano Zampini   __host__ __device__
51462ed87e7eSStefano Zampini   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
5147ed502f03SStefano Zampini   {
5148ed502f03SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
5149ed502f03SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
5150ed502f03SStefano Zampini     return false;
5151ed502f03SStefano Zampini   }
5152ed502f03SStefano Zampini };
5153ed502f03SStefano Zampini 
51548909a122SStefano Zampini struct Shift
51558909a122SStefano Zampini {
5156ed502f03SStefano Zampini   int _shift;
5157ed502f03SStefano Zampini 
5158ed502f03SStefano Zampini   Shift(int shift) : _shift(shift) {}
5159ed502f03SStefano Zampini   __host__ __device__
5160ed502f03SStefano Zampini   inline int operator() (const int &c)
5161ed502f03SStefano Zampini   {
5162ed502f03SStefano Zampini     return c + _shift;
5163ed502f03SStefano Zampini   }
5164ed502f03SStefano Zampini };
5165ed502f03SStefano Zampini 
5166ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
5167ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
5168ed502f03SStefano Zampini {
5169ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
5170ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
5171ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
5172ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
5173ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
5174ed502f03SStefano Zampini   cusparseStatus_t             stat;
5175ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
5176ed502f03SStefano Zampini 
5177ed502f03SStefano Zampini   PetscFunctionBegin;
5178ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
5179ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
5180ed502f03SStefano Zampini   PetscValidPointer(C,4);
5181ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
5182ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
51835f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
518408401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
5185aed4548fSBarry Smith   PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5186aed4548fSBarry Smith   PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
5187ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
5188ed502f03SStefano Zampini     m     = A->rmap->n;
5189ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
51909566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,C));
51919566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C,m,n,m,n));
51929566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
5193ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
5194ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
5195ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
5196ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
5197ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
5198ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
5199ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
5200ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
5201ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
5202ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
5203ed502f03SStefano Zampini     Ccusp->nrows    = m;
5204ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
5205ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
5206ed502f03SStefano Zampini     Ccsr->num_rows  = m;
5207ed502f03SStefano Zampini     Ccsr->num_cols  = n;
52089566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
52099566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
52109566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
52119566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
52129566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
52139566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
52149566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
52159566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
52169566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
52179566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
52189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
521928b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
522028b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5221ed502f03SStefano Zampini 
5222ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
5223ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5224ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
5225ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
5226ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
5227ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
5228ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
5229ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
5230ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
5231ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
5232ed502f03SStefano Zampini     if (c->nz) {
52332ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
52342ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
52352ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
52362ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
52372ed87e7eSStefano Zampini 
5238ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
5239ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
5240ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
5241ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
52429566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
5243ed502f03SStefano Zampini         }
52442ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
52452ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
5246ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
5247ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
5248ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
5249ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
52509566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
5251ed502f03SStefano Zampini         }
52522ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
52532ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
52549566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
52552ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
52562ed87e7eSStefano Zampini                               Aroff->data().get(),
52572ed87e7eSStefano Zampini                               Annz,
52582ed87e7eSStefano Zampini                               m,
52592ed87e7eSStefano Zampini                               Acoo->data().get(),
52609566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
5261ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
52622ed87e7eSStefano Zampini                               Broff->data().get(),
5263ed502f03SStefano Zampini                               Bnnz,
5264ed502f03SStefano Zampini                               m,
52652ed87e7eSStefano Zampini                               Bcoo->data().get(),
52669566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
52672ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
52682ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
52692ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
52708909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
5271ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
5272ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
52738909a122SStefano Zampini #else
52748909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
52758909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
52768909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
52778909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
52788909a122SStefano Zampini #endif
52792ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
52802ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
52812ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
52822ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
52832ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
52842ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
5285ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
5286ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
5287ed502f03SStefano Zampini       thrust::advance(p2,Annz);
52882ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
52898909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
52908909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
52918909a122SStefano Zampini #endif
52922ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
52932ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
52942ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
52952ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
52962ed87e7eSStefano Zampini #else
52972ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
52982ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
52992ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
53002ed87e7eSStefano Zampini #endif
5301ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
53022ed87e7eSStefano Zampini                               Ccoo->data().get(),
5303ed502f03SStefano Zampini                               c->nz,
5304ed502f03SStefano Zampini                               m,
5305ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
53069566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
53079566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
53082ed87e7eSStefano Zampini       delete wPerm;
53092ed87e7eSStefano Zampini       delete Acoo;
53102ed87e7eSStefano Zampini       delete Bcoo;
53112ed87e7eSStefano Zampini       delete Ccoo;
5312ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5313ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
5314ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
5315ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
53169566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5317ed502f03SStefano Zampini #endif
53181a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
53199566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
53209566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
5321ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5322ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
5323ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
5324ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
5325ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
5326ed502f03SStefano Zampini 
53271a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
53281a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
5329a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
5330ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
5331ed502f03SStefano Zampini         CmatT->mat = CcsrT;
5332ed502f03SStefano Zampini         CcsrT->num_rows = n;
5333ed502f03SStefano Zampini         CcsrT->num_cols = m;
5334ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
5335ed502f03SStefano Zampini 
5336ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
5337ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
5338ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
5339ed502f03SStefano Zampini 
53409566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
5341ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
5342ed502f03SStefano Zampini         if (AT) {
5343ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
5344ed502f03SStefano Zampini           thrust::advance(rT,-1);
5345ed502f03SStefano Zampini         }
5346ed502f03SStefano Zampini         if (BT) {
5347ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
5348ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
5349ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
5350ed502f03SStefano Zampini         }
5351ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
5352ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
5353ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
5354ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
5355ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
5356ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
53579566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
5358ed502f03SStefano Zampini 
53599566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
53609566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
53619566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
53629566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
53639566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
53649566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
53659566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
53669566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
53679566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
5368ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
5369ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
5370ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
5371ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
53729566063dSJacob Faibussowitsch                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
5373ed502f03SStefano Zampini #endif
5374ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
5375ed502f03SStefano Zampini       }
5376ed502f03SStefano Zampini     }
5377ed502f03SStefano Zampini 
5378ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
5379ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
5380ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
53819566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m+1,&c->i));
53829566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->j));
5383ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
5384ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
5385ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
5386ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
5387ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
53889566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
53899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5390ed502f03SStefano Zampini     } else {
53919566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
53929566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
5393ed502f03SStefano Zampini     }
53949566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
53959566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->ilen));
53969566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->imax));
5397ed502f03SStefano Zampini     c->maxnz = c->nz;
5398ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
5399ed502f03SStefano Zampini     c->rmax = 0;
5400ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
5401ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
5402ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
5403ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
5404ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
5405ed502f03SStefano Zampini     }
54069566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
54079566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->a));
5408ed502f03SStefano Zampini     (*C)->nonzerostate++;
54099566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
54109566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
5411ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
5412ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
5413ed502f03SStefano Zampini   } else {
541408401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
5415ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
5416ed502f03SStefano Zampini     if (c->nz) {
5417ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
54185f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
5419aed4548fSBarry Smith       PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
542008401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
54219566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
54229566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
54235f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
54245f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
5425ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
5426ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
5427ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
5428aed4548fSBarry Smith       PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
5429aed4548fSBarry Smith       PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
5430aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
5431aed4548fSBarry Smith       PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
54325f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
5433ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
5434ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
54359566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
5436ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
5437ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
5438ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
5439ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
5440ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
5441ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
5442ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
5443ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
5444ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
5445ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
54469566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
54471a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
54485f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
5449ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
5450ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
5451ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
5452ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
5453ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
5454ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
5455ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
54561a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
5457ed502f03SStefano Zampini       }
54589566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
5459ed502f03SStefano Zampini     }
5460ed502f03SStefano Zampini   }
54619566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
5462ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
5463ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
5464ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
5465ed502f03SStefano Zampini   PetscFunctionReturn(0);
5466ed502f03SStefano Zampini }
5467c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the matrix value array, as returned by
  MatSeqAIJCUSPARSEGetArrayRead(), into a user-provided buffer.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions in the value array of the entries to gather, or NULL to copy the first n entries.
        It is staged into a device array with assign(), so it is presumably a host pointer -- confirm against callers.

  Output Parameter:
. v - receives the n values; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  With n > 0 and idx given, v[i] receives the idx[i]-th entry of the value array. The gather always
  runs on the device; when v is host memory the result goes through a temporary device buffer and is
  then copied back.

  Without idx (or with n == 0) a single cudaMemcpy() of n entries is issued.

  The array obtained from A is not restored if an intermediate call fails and returns early.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* stage the gather indices on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    /* Device staging buffer, only sized when v lives on the host. Keeping it as a scoped object
       (instead of new/delete) guarantees it is released even if a checked call below returns early */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[i]] with dv[i]; VecCUDAEquals (defined elsewhere) is used here to assign the
       second element of each pair from the first */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the host buffer */
      PetscCallCUDA(cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
  } else {
    /* contiguous copy of the leading n entries (a zero-byte copy when n == 0) */
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values moved from the device to the host, so account for them as a GPU-to-CPU transfer */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
5505