xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 08401ef684002a709c6d3db98a0c9f54a8bcf1ec)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Name tables handed to PetscOptionsEnum() in this file. Each table lists the selectable names in
   enum-value order, followed by the enum type name, the prefix of the enum constants and a null terminator. */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  NULL
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT",
    "COOMV_ALG",
    "CSRMV_ALG1",
    "CSRMV_ALG2",
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    NULL
  };
  /* Note the order: CSR_ALG1 (value 4) comes before COO_ALG4 (value 5), matching the enum values above */
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT",
    "COO_ALG1",
    "COO_ALG2",
    "COO_ALG3",
    "CSR_ALG1",
    "COO_ALG4",
    "CSR_ALG2",
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    NULL
  };
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID", /* cusparse does not have enum 0! We created one */
    "ALG1",
    "ALG2",
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    NULL
  };
#endif
589ae82921SPaul Mullowney 
/* ---- Forward declarations of file-local routines ---- */

/* Cholesky / ICC factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo*);

/* LU / ILU factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo*);

/* Solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);

/* Options and basic matrix operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject, Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);

/* Matrix-vector products */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* Destroy routines; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded on its argument types */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Copy from GPU and transpose invalidation */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);

/* Sub-array copy and COO interface */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, const PetscInt[], const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
94c215019aSStefano Zampini 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

  Input Parameter:
. A - the matrix being queried (not read by this routine)

  Output Parameter:
. type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1019ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1149ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the factor matrix used by the "cusparse" solver type.

  Input Parameters:
+ A     - the matrix to be factored; its local row count is used for every dimension of the factor
- ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

  Output Parameter:
. B - the new MATSEQAIJCUSPARSE matrix with its symbolic-factorization callbacks and preferred orderings set

  Notes:
  When A is bound to the CPU the SeqAIJ symbolic routines are installed instead of the CUSPARSE ones.
  Any other factor type raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt nloc = A->rmap->n;
  Mat            F;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  F = *B;
  PetscCall(MatSetSizes(F,nloc,nloc,nloc,nloc));
  F->factortype = ftype; /* set before MatSetType(), as in the original statement order */
  PetscCall(MatSetType(F,MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(F,PETSC_TRUE));

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    PetscCall(MatSetBlockSizesFromMats(F,A,A));
    if (A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILUDT]));
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ICC]));
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  PetscCall(MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL));
  F->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
1559ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Records the requested storage format in the matrix's
  Mat_SeqAIJCUSPARSE data.

  Input Parameters:
+ A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
. op     - MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL; both store into the same field, any other value raises PETSC_ERR_SUP
- format - the storage format to record
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op != MAT_CUSPARSE_MULT && op != MAT_CUSPARSE_ALL) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  cusp->format = format;
  PetscFunctionReturn(0);
}
1739ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch through the method composed on A under the name "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
198e057df02SPaul Mullowney 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Stores the use-CPU-solve flag in the matrix's
  Mat_SeqAIJCUSPARSE data.

  Input Parameters:
+ A       - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
- use_cpu - value written to the use_cpu_solve field
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
207365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether the built-in CPU MatSolve is used.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method is used to specify whether the solve is done on the CPU or the GPU (GPU is the default).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch through the method composed on A under the name "MatCUSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
233365b711fSMark Adams 
/*
  MatSetOption_SeqAIJCUSPARSE - Handles MAT_FORM_EXPLICIT_TRANSPOSE locally and forwards every
  other option to MatSetOption_SeqAIJ().

  Input Parameters:
+ A   - the matrix
. op  - the option being set
- flg - the new value of the option
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op != MAT_FORM_EXPLICIT_TRANSPOSE) {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
    PetscFunctionReturn(0);
  }
  /* When the option is switched off, destroy the transpose matrix if present,
     to prevent logic errors if flg is set to true again later */
  if (A->form_explicit_transpose && !flg) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  }
  A->form_explicit_transpose = flg;
  PetscFunctionReturn(0);
}
249e6e9a74fSStefano Zampini 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization for the CUSPARSE factor matrix.

  Input Parameters:
+ B    - the factor matrix (updated in place)
. A    - the matrix to factor
- info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

  Notes:
  Calls MatSeqAIJCUSPARSECopyFromGPU() on A, runs MatLUFactorNumeric_SeqAIJ(), and marks B as
  PETSC_OFFLOAD_CPU. Unless use_cpu_solve is set, it then installs the CUSPARSE solve callbacks
  (the NaturalOrdering variants when both the row and column orderings are identities) and calls
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() on B. The matsolve/matsolvetranspose callbacks are always cleared.

  NOTE(review): this routine reads B->spptr as a Mat_SeqAIJCUSPARSE, whereas the symbolic
  factorization routines in this file read B->spptr as a Mat_SeqAIJCUSPARSETriFactors. Confirm
  that use_cpu_solve is valid through this cast for factor matrices.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *fact = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  IS                 rperm = fact->row,cperm = fact->col;
  PetscBool          rnatural,cnatural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the orderings decide which version of MatSolve is needed */
  PetscCall(ISIdentity(rperm,&rnatural));
  PetscCall(ISIdentity(cperm,&cnatural));

  /* cleared for every ordering and solve location */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* CPU solve requested: leave the solve callbacks untouched and skip the GPU analysis */
  if (cusp->use_cpu_solve) PetscFunctionReturn(0);

  if (rnatural && cnatural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }

  /* get the triangular factors */
  PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
288bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the SeqAIJCUSPARSE command line options.

  Input Parameters:
+ PetscOptionsObject - the options context used by the PetscOptionsXXX() calls
- A                  - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE

  Notes:
  Options are only queried when A is not a factor matrix (A->factortype == MAT_FACTOR_NONE):
  -mat_cusparse_mult_storage_format, -mat_cusparse_storage_format, -mat_cusparse_use_cpu_solve and,
  with CUDA 11.0 or later, -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg and -mat_cusparse_csr2csc_alg.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;

  PetscFunctionBegin;
  PetscCall(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set));
    if (set) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt));

    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set));
    if (set) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt));

    /* the flag is written straight into the struct and then re-applied through the public setter */
    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusp->use_cpu_solve,&cusp->use_cpu_solve,&set));
    if (set) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusp->spmvAlg,(PetscEnum*)&cusp->spmvAlg,&set));
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum()
       sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(set && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(set && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusp->spmmAlg,(PetscEnum*)&cusp->spmmAlg,&set));
    PetscCheckFalse(set && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusp->csr2cscAlg,(PetscEnum*)&cusp->csr2cscAlg,&set));
    PetscCheckFalse(set && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscCall(PetscOptionsTail());
  PetscFunctionReturn(0);
}
3299ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatILUFactorSymbolic_SeqAIJ(), then installs MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric routine.

  Input Parameters:
+ B     - the factor matrix
. A     - the matrix to factor
. isrow - row ordering
. iscol - column ordering
- info  - factorization options
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  /* assigned after the SeqAIJ symbolic phase, as in the original statement order */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3409ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatLUFactorSymbolic_SeqAIJ(), then installs MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric routine.

  Input Parameters:
+ B     - the factor matrix
. A     - the matrix to factor
. isrow - row ordering
. iscol - column ordering
- info  - factorization options
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  /* assigned after the SeqAIJ symbolic phase, as in the original statement order */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3519ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatICCFactorSymbolic_SeqAIJ(), then installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric routine.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
. perm - ordering
- info - factorization options
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  /* assigned after the SeqAIJ symbolic phase, as in the original statement order */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
362087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatCholeskyFactorSymbolic_SeqAIJ(), then installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric routine.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
. perm - ordering
- info - factorization options
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  /* assigned after the SeqAIJ symbolic phase, as in the original statement order */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
373087f3262SPaul Mullowney 
374087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3759ae82921SPaul Mullowney {
3769ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3779ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3789ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
379aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3809ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3819ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3829ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3839ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
3849ae82921SPaul Mullowney 
3859ae82921SPaul Mullowney   PetscFunctionBegin;
386cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
387c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3889ae82921SPaul Mullowney     try {
3899ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3909ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
391da79fbbcSStefano Zampini       if (!loTriFactor) {
3922cbc15d9SMark         PetscScalar                       *AALo;
3932cbc15d9SMark 
3949566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
3989566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));
3999ae82921SPaul Mullowney 
4009ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
4019ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
4029ae82921SPaul Mullowney         AiLo[n]  = nzLower;
4039ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
4049ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4059ae82921SPaul Mullowney         v        = aa;
4069ae82921SPaul Mullowney         vi       = aj;
4079ae82921SPaul Mullowney         offset   = 1;
4089ae82921SPaul Mullowney         rowOffset= 1;
4099ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4109ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
411e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4129ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4139ae82921SPaul Mullowney           rowOffset += nz+1;
4149ae82921SPaul Mullowney 
4159566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
4169566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
4179ae82921SPaul Mullowney 
4189ae82921SPaul Mullowney           offset      += nz;
4199ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4209ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4219ae82921SPaul Mullowney           offset      += 1;
4229ae82921SPaul Mullowney 
4239ae82921SPaul Mullowney           v  += nz;
4249ae82921SPaul Mullowney           vi += nz;
4259ae82921SPaul Mullowney         }
4262205254eSKarl Rupp 
427aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4289566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
429da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
430aa372e3fSPaul Mullowney         /* Create the matrix description */
4319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
4329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4331b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
4349566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
435afb2bd1cSJunchao Zhang        #else
4369566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
437afb2bd1cSJunchao Zhang        #endif
4389566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
4399566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
440aa372e3fSPaul Mullowney 
441aa372e3fSPaul Mullowney         /* set the operation */
442aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
443aa372e3fSPaul Mullowney 
444aa372e3fSPaul Mullowney         /* set the matrix */
445aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
446aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
447aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
448aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
449aa372e3fSPaul Mullowney 
450aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
451aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
452aa372e3fSPaul Mullowney 
453aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
454aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
455aa372e3fSPaul Mullowney 
456aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
457aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
458aa372e3fSPaul Mullowney 
459afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4609566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
4619566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
4621b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
4639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
464afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
465afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
466afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
4675f80ce2aSJacob Faibussowitsch                                                &loTriFactor->solveBufferSize));
4689566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
469afb2bd1cSJunchao Zhang       #endif
470afb2bd1cSJunchao Zhang 
471aa372e3fSPaul Mullowney         /* perform the solve analysis */
4729566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
473aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
474aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
475d49cd2b7SBarry Smith                                          loTriFactor->csrMat->column_indices->data().get(),
4761b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
477d49cd2b7SBarry Smith                                          loTriFactor->solveInfo,
4785f80ce2aSJacob Faibussowitsch                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
479d49cd2b7SBarry Smith                                          #else
4805f80ce2aSJacob Faibussowitsch                                          loTriFactor->solveInfo));
481afb2bd1cSJunchao Zhang                                          #endif
4829566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4839566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
484aa372e3fSPaul Mullowney 
485da79fbbcSStefano Zampini         /* assign the pointer */
486aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4872cbc15d9SMark         loTriFactor->AA_h = AALo;
4889566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4899566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4909566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
491da79fbbcSStefano Zampini       } else { /* update values only */
4922cbc15d9SMark         if (!loTriFactor->AA_h) {
4939566063dSJacob Faibussowitsch           PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
4942cbc15d9SMark         }
495da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4962cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
497da79fbbcSStefano Zampini         v        = aa;
498da79fbbcSStefano Zampini         vi       = aj;
499da79fbbcSStefano Zampini         offset   = 1;
500da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
501da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
5029566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
503da79fbbcSStefano Zampini           offset      += nz;
5042cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
505da79fbbcSStefano Zampini           offset      += 1;
506da79fbbcSStefano Zampini           v  += nz;
507da79fbbcSStefano Zampini         }
5082cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
5099566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
510da79fbbcSStefano Zampini       }
5119ae82921SPaul Mullowney     } catch(char *ex) {
51298921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5139ae82921SPaul Mullowney     }
5149ae82921SPaul Mullowney   }
5159ae82921SPaul Mullowney   PetscFunctionReturn(0);
5169ae82921SPaul Mullowney }
5179ae82921SPaul Mullowney 
518087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
5199ae82921SPaul Mullowney {
5209ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
5219ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
5229ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
523aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
5249ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
5259ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
5269ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
5279ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
5289ae82921SPaul Mullowney 
5299ae82921SPaul Mullowney   PetscFunctionBegin;
530cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
531c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5329ae82921SPaul Mullowney     try {
5339ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5349ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
535da79fbbcSStefano Zampini       if (!upTriFactor) {
5362cbc15d9SMark         PetscScalar *AAUp;
5372cbc15d9SMark 
5389566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
5392cbc15d9SMark 
5409ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
5419566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
5429566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
5439ae82921SPaul Mullowney 
5449ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5459ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5469ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5479ae82921SPaul Mullowney         offset = nzUpper;
5489ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5499ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5509ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5519ae82921SPaul Mullowney 
552e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5539ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5549ae82921SPaul Mullowney 
555e057df02SPaul Mullowney           /* decrement the offset */
5569ae82921SPaul Mullowney           offset -= (nz+1);
5579ae82921SPaul Mullowney 
558e057df02SPaul Mullowney           /* first, set the diagonal elements */
5599ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
56009f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5619ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5629ae82921SPaul Mullowney 
5639566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
5649566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
5659ae82921SPaul Mullowney         }
5662205254eSKarl Rupp 
567aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
5689566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
569da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5702205254eSKarl Rupp 
571aa372e3fSPaul Mullowney         /* Create the matrix description */
5729566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
5739566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
5741b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
5759566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
576afb2bd1cSJunchao Zhang        #else
5779566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
578afb2bd1cSJunchao Zhang        #endif
5799566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
5809566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
581aa372e3fSPaul Mullowney 
582aa372e3fSPaul Mullowney         /* set the operation */
583aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
584aa372e3fSPaul Mullowney 
585aa372e3fSPaul Mullowney         /* set the matrix */
586aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
587aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
588aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
589aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
590aa372e3fSPaul Mullowney 
591aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
592aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
593aa372e3fSPaul Mullowney 
594aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
595aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
596aa372e3fSPaul Mullowney 
597aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
598aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
599aa372e3fSPaul Mullowney 
600afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
6019566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
6029566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
6031b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
6049566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
605afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
606afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
607afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
6085f80ce2aSJacob Faibussowitsch                                                &upTriFactor->solveBufferSize));
6099566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
610afb2bd1cSJunchao Zhang       #endif
611afb2bd1cSJunchao Zhang 
612aa372e3fSPaul Mullowney         /* perform the solve analysis */
6139566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
614aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
615aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
616d49cd2b7SBarry Smith                                          upTriFactor->csrMat->column_indices->data().get(),
6171b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
618d49cd2b7SBarry Smith                                          upTriFactor->solveInfo,
6195f80ce2aSJacob Faibussowitsch                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
620d49cd2b7SBarry Smith                                          #else
6215f80ce2aSJacob Faibussowitsch                                          upTriFactor->solveInfo));
622afb2bd1cSJunchao Zhang                                          #endif
6239566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
6249566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
625aa372e3fSPaul Mullowney 
626da79fbbcSStefano Zampini         /* assign the pointer */
627aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
6282cbc15d9SMark         upTriFactor->AA_h = AAUp;
6299566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
6309566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
6319566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
632da79fbbcSStefano Zampini       } else {
6332cbc15d9SMark         if (!upTriFactor->AA_h) {
6349566063dSJacob Faibussowitsch           PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
6352cbc15d9SMark         }
636da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
637da79fbbcSStefano Zampini         offset = nzUpper;
638da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
639da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
640da79fbbcSStefano Zampini 
641da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
642da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
643da79fbbcSStefano Zampini 
644da79fbbcSStefano Zampini           /* decrement the offset */
645da79fbbcSStefano Zampini           offset -= (nz+1);
646da79fbbcSStefano Zampini 
647da79fbbcSStefano Zampini           /* first, set the diagonal elements */
6482cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
6499566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
650da79fbbcSStefano Zampini         }
6512cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
6529566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
653da79fbbcSStefano Zampini       }
6549ae82921SPaul Mullowney     } catch(char *ex) {
65598921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6569ae82921SPaul Mullowney     }
6579ae82921SPaul Mullowney   }
6589ae82921SPaul Mullowney   PetscFunctionReturn(0);
6599ae82921SPaul Mullowney }
6609ae82921SPaul Mullowney 
661087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6629ae82921SPaul Mullowney {
6639ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6649ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6659ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6669ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6679ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6689ae82921SPaul Mullowney 
6699ae82921SPaul Mullowney   PetscFunctionBegin;
67028b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
6719566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
6729566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));
6732205254eSKarl Rupp 
674da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
675aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6769ae82921SPaul Mullowney 
677c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
678e057df02SPaul Mullowney   /* lower triangular indices */
6799566063dSJacob Faibussowitsch   PetscCall(ISIdentity(isrow,&row_identity));
680da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
681da79fbbcSStefano Zampini     const PetscInt *r;
682da79fbbcSStefano Zampini 
6839566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(isrow,&r));
684aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
685aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6869566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(isrow,&r));
6879566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
688da79fbbcSStefano Zampini   }
6899ae82921SPaul Mullowney 
690e057df02SPaul Mullowney   /* upper triangular indices */
6919566063dSJacob Faibussowitsch   PetscCall(ISIdentity(iscol,&col_identity));
692da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
693da79fbbcSStefano Zampini     const PetscInt *c;
694da79fbbcSStefano Zampini 
6959566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iscol,&c));
696aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
697aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6989566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iscol,&c));
6999566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));
700da79fbbcSStefano Zampini   }
7019ae82921SPaul Mullowney   PetscFunctionReturn(0);
7029ae82921SPaul Mullowney }
7039ae82921SPaul Mullowney 
704087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
705087f3262SPaul Mullowney {
706087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
707087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
708aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
709aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
710087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
711087f3262SPaul Mullowney   PetscScalar                       *AAUp;
712087f3262SPaul Mullowney   PetscScalar                       *AALo;
713087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
714087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
715087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
716087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
717087f3262SPaul Mullowney 
718087f3262SPaul Mullowney   PetscFunctionBegin;
719cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
720c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
721087f3262SPaul Mullowney     try {
7229566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
7239566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
724da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
725087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7269566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
7279566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
728087f3262SPaul Mullowney 
729087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
730087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
731087f3262SPaul Mullowney         AiUp[n]=nzUpper;
732087f3262SPaul Mullowney         offset = 0;
733087f3262SPaul Mullowney         for (i=0; i<n; i++) {
734087f3262SPaul Mullowney           /* set the pointers */
735087f3262SPaul Mullowney           v  = aa + ai[i];
736087f3262SPaul Mullowney           vj = aj + ai[i];
737087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
738087f3262SPaul Mullowney 
739087f3262SPaul Mullowney           /* first, set the diagonal elements */
740087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
74109f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
742087f3262SPaul Mullowney           AiUp[i]      = offset;
74309f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
744087f3262SPaul Mullowney 
745087f3262SPaul Mullowney           offset+=1;
746087f3262SPaul Mullowney           if (nz>0) {
7479566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
7489566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
749087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
750087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
751087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
752087f3262SPaul Mullowney             }
753087f3262SPaul Mullowney             offset+=nz;
754087f3262SPaul Mullowney           }
755087f3262SPaul Mullowney         }
756087f3262SPaul Mullowney 
757aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
7589566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
759da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
760087f3262SPaul Mullowney 
761aa372e3fSPaul Mullowney         /* Create the matrix description */
7629566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
7639566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
7641b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
7659566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
766afb2bd1cSJunchao Zhang        #else
7679566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
768afb2bd1cSJunchao Zhang        #endif
7699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
7709566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
771087f3262SPaul Mullowney 
772aa372e3fSPaul Mullowney         /* set the matrix */
773aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
774aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
775aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
776aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
777aa372e3fSPaul Mullowney 
778aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
779aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
780aa372e3fSPaul Mullowney 
781aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
782aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
783aa372e3fSPaul Mullowney 
784aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
785aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
786aa372e3fSPaul Mullowney 
787afb2bd1cSJunchao Zhang         /* set the operation */
788afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
789afb2bd1cSJunchao Zhang 
790afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
7919566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
7929566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
7931b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
7949566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
795afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
796afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
797afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
7985f80ce2aSJacob Faibussowitsch                                                &upTriFactor->solveBufferSize));
7999566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
800afb2bd1cSJunchao Zhang       #endif
801afb2bd1cSJunchao Zhang 
802aa372e3fSPaul Mullowney         /* perform the solve analysis */
8039566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
804aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
805aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
806d49cd2b7SBarry Smith                                          upTriFactor->csrMat->column_indices->data().get(),
8071b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
808d49cd2b7SBarry Smith                                          upTriFactor->solveInfo,
8095f80ce2aSJacob Faibussowitsch                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
810d49cd2b7SBarry Smith                                          #else
8115f80ce2aSJacob Faibussowitsch                                          upTriFactor->solveInfo));
812afb2bd1cSJunchao Zhang                                          #endif
8139566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8149566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
815aa372e3fSPaul Mullowney 
816da79fbbcSStefano Zampini         /* assign the pointer */
817aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
818aa372e3fSPaul Mullowney 
819aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8209566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
821da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney         /* Create the matrix description */
8249566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8259566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8261b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8279566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
828afb2bd1cSJunchao Zhang        #else
8299566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
830afb2bd1cSJunchao Zhang        #endif
8319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8329566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
833aa372e3fSPaul Mullowney 
834aa372e3fSPaul Mullowney         /* set the operation */
835aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
836aa372e3fSPaul Mullowney 
837aa372e3fSPaul Mullowney         /* set the matrix */
838aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
839aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
840aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
841aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
842aa372e3fSPaul Mullowney 
843aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
844aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
845aa372e3fSPaul Mullowney 
846aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
847aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
848aa372e3fSPaul Mullowney 
849aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
850aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
851aa372e3fSPaul Mullowney 
852afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8539566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
8549566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
8551b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8569566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
857afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
858afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
859afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
8605f80ce2aSJacob Faibussowitsch                                                &loTriFactor->solveBufferSize));
8619566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
862afb2bd1cSJunchao Zhang       #endif
863afb2bd1cSJunchao Zhang 
864aa372e3fSPaul Mullowney         /* perform the solve analysis */
8659566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
866aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
867aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
868d49cd2b7SBarry Smith                                          loTriFactor->csrMat->column_indices->data().get(),
8691b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
870d49cd2b7SBarry Smith                                          loTriFactor->solveInfo,
8715f80ce2aSJacob Faibussowitsch                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
872d49cd2b7SBarry Smith                                          #else
8735f80ce2aSJacob Faibussowitsch                                          loTriFactor->solveInfo));
874afb2bd1cSJunchao Zhang                                          #endif
8759566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8769566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
877aa372e3fSPaul Mullowney 
878da79fbbcSStefano Zampini         /* assign the pointer */
879aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
880087f3262SPaul Mullowney 
8819566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
8829566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
8839566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
884da79fbbcSStefano Zampini       } else {
885da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
886da79fbbcSStefano Zampini         offset = 0;
887da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
888da79fbbcSStefano Zampini           /* set the pointers */
889da79fbbcSStefano Zampini           v  = aa + ai[i];
890da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
891da79fbbcSStefano Zampini 
892da79fbbcSStefano Zampini           /* first, set the diagonal elements */
893da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
894da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
895da79fbbcSStefano Zampini 
896da79fbbcSStefano Zampini           offset+=1;
897da79fbbcSStefano Zampini           if (nz>0) {
8989566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
899da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
900da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
901da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
902da79fbbcSStefano Zampini             }
903da79fbbcSStefano Zampini             offset+=nz;
904da79fbbcSStefano Zampini           }
905da79fbbcSStefano Zampini         }
90628b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
90728b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
908da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
909da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
9109566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
911da79fbbcSStefano Zampini       }
9129566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
9139566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
914087f3262SPaul Mullowney     } catch(char *ex) {
91598921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
916087f3262SPaul Mullowney     }
917087f3262SPaul Mullowney   }
918087f3262SPaul Mullowney   PetscFunctionReturn(0);
919087f3262SPaul Mullowney }
920087f3262SPaul Mullowney 
921087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
9229ae82921SPaul Mullowney {
923087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
924087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
925087f3262SPaul Mullowney   IS                           ip = a->row;
926087f3262SPaul Mullowney   PetscBool                    perm_identity;
927087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
928087f3262SPaul Mullowney 
929087f3262SPaul Mullowney   PetscFunctionBegin;
93028b400f6SJacob Faibussowitsch   PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
9319566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
932da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
933aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
934aa372e3fSPaul Mullowney 
935da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
936da79fbbcSStefano Zampini 
937087f3262SPaul Mullowney   /* lower triangular indices */
9389566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip,&perm_identity));
939087f3262SPaul Mullowney   if (!perm_identity) {
9404e4bbfaaSStefano Zampini     IS             iip;
941da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9424e4bbfaaSStefano Zampini 
9439566063dSJacob Faibussowitsch     PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
9449566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(iip,&irip));
9459566063dSJacob Faibussowitsch     PetscCall(ISGetIndices(ip,&rip));
946aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
947aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
948aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9494e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9509566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(iip,&irip));
9519566063dSJacob Faibussowitsch     PetscCall(ISDestroy(&iip));
9529566063dSJacob Faibussowitsch     PetscCall(ISRestoreIndices(ip,&rip));
9539566063dSJacob Faibussowitsch     PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
954da79fbbcSStefano Zampini   }
955087f3262SPaul Mullowney   PetscFunctionReturn(0);
956087f3262SPaul Mullowney }
957087f3262SPaul Mullowney 
958087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
959087f3262SPaul Mullowney {
960087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
961087f3262SPaul Mullowney   IS             ip = b->row;
962087f3262SPaul Mullowney   PetscBool      perm_identity;
963087f3262SPaul Mullowney 
964087f3262SPaul Mullowney   PetscFunctionBegin;
9659566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
9669566063dSJacob Faibussowitsch   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
967ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
968087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9699566063dSJacob Faibussowitsch   PetscCall(ISIdentity(ip,&perm_identity));
970087f3262SPaul Mullowney   if (perm_identity) {
971087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
972087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9734e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9744e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
975087f3262SPaul Mullowney   } else {
976087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
977087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9784e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9794e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
980087f3262SPaul Mullowney   }
981087f3262SPaul Mullowney 
982087f3262SPaul Mullowney   /* get the triangular factors */
9839566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
984087f3262SPaul Mullowney   PetscFunctionReturn(0);
985087f3262SPaul Mullowney }
9869ae82921SPaul Mullowney 
987b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
988bda325fcSPaul Mullowney {
989bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
990aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
991aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
992da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
993da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
994aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
995aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
996aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
997aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
998b175d8bbSPaul Mullowney 
999bda325fcSPaul Mullowney   PetscFunctionBegin;
1000aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
10019566063dSJacob Faibussowitsch   PetscCall(PetscNew(&loTriFactorT));
1002da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1003aa372e3fSPaul Mullowney 
1004aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1005aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1006aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1007aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1008aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1009aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1010aa372e3fSPaul Mullowney 
1011aa372e3fSPaul Mullowney   /* Create the matrix description */
10129566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr));
10139566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase));
10149566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType));
10159566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode));
10169566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType));
1017aa372e3fSPaul Mullowney 
1018aa372e3fSPaul Mullowney   /* set the operation */
1019aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1020aa372e3fSPaul Mullowney 
1021aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1022aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1023afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1024afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1025aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1026afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1027afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1028afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1029aa372e3fSPaul Mullowney 
1030aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1031afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
10329566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1033afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1034afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(),
1035afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->row_offsets->data().get(),
1036afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(),
1037afb2bd1cSJunchao Zhang                                                loTriFactorT->csrMat->values->data().get(),
1038afb2bd1cSJunchao Zhang                                                loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1039afb2bd1cSJunchao Zhang                                                CUSPARSE_ACTION_NUMERIC,indexBase,
10405f80ce2aSJacob Faibussowitsch                                                CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize));
10419566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize));
1042afb2bd1cSJunchao Zhang #endif
1043afb2bd1cSJunchao Zhang 
10449566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
10459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1046aa372e3fSPaul Mullowney                                   loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1047aa372e3fSPaul Mullowney                                   loTriFactor->csrMat->values->data().get(),
1048aa372e3fSPaul Mullowney                                   loTriFactor->csrMat->row_offsets->data().get(),
1049aa372e3fSPaul Mullowney                                   loTriFactor->csrMat->column_indices->data().get(),
1050aa372e3fSPaul Mullowney                                   loTriFactorT->csrMat->values->data().get(),
1051afb2bd1cSJunchao Zhang                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1052afb2bd1cSJunchao Zhang                                   loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1053afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC, indexBase,
10545f80ce2aSJacob Faibussowitsch                                   CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer));
1055afb2bd1cSJunchao Zhang                                   #else
1056afb2bd1cSJunchao Zhang                                   loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
10575f80ce2aSJacob Faibussowitsch                                   CUSPARSE_ACTION_NUMERIC, indexBase));
1058afb2bd1cSJunchao Zhang                                   #endif
10599566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
10609566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1061aa372e3fSPaul Mullowney 
1062afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
10639566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
10649566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactorT->solveInfo));
10651b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
10669566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1067afb2bd1cSJunchao Zhang                                          loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1068afb2bd1cSJunchao Zhang                                          loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1069afb2bd1cSJunchao Zhang                                          loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
10705f80ce2aSJacob Faibussowitsch                                          &loTriFactorT->solveBufferSize));
10719566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize));
1072afb2bd1cSJunchao Zhang #endif
1073afb2bd1cSJunchao Zhang 
1074afb2bd1cSJunchao Zhang   /* perform the solve analysis */
10759566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1076afb2bd1cSJunchao Zhang                                    loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1077afb2bd1cSJunchao Zhang                                    loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1078d49cd2b7SBarry Smith                                    loTriFactorT->csrMat->column_indices->data().get(),
10791b0a6780SStefano Zampini                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1080d49cd2b7SBarry Smith                                    loTriFactorT->solveInfo,
10815f80ce2aSJacob Faibussowitsch                                    loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));
1082d49cd2b7SBarry Smith                                    #else
10835f80ce2aSJacob Faibussowitsch                                    loTriFactorT->solveInfo));
1084afb2bd1cSJunchao Zhang                                    #endif
10859566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
10869566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1087aa372e3fSPaul Mullowney 
1088da79fbbcSStefano Zampini   /* assign the pointer */
1089aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1090aa372e3fSPaul Mullowney 
1091aa372e3fSPaul Mullowney   /*********************************************/
1092aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1093aa372e3fSPaul Mullowney   /*********************************************/
1094aa372e3fSPaul Mullowney 
1095aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
10969566063dSJacob Faibussowitsch   PetscCall(PetscNew(&upTriFactorT));
1097da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1098aa372e3fSPaul Mullowney 
1099aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1100aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1101aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1102aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1103aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1104aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1105aa372e3fSPaul Mullowney 
1106aa372e3fSPaul Mullowney   /* Create the matrix description */
11079566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr));
11089566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase));
11099566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType));
11109566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode));
11119566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType));
1112aa372e3fSPaul Mullowney 
1113aa372e3fSPaul Mullowney   /* set the operation */
1114aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1115aa372e3fSPaul Mullowney 
1116aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1117aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1118afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1119afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1120aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1121afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1122afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1123afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1124aa372e3fSPaul Mullowney 
1125aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1126afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
11279566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1128afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1129afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->values->data().get(),
1130afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->row_offsets->data().get(),
1131afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->column_indices->data().get(),
1132afb2bd1cSJunchao Zhang                                                upTriFactorT->csrMat->values->data().get(),
1133afb2bd1cSJunchao Zhang                                                upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1134afb2bd1cSJunchao Zhang                                                CUSPARSE_ACTION_NUMERIC,indexBase,
11355f80ce2aSJacob Faibussowitsch                                                CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize));
11369566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize));
1137afb2bd1cSJunchao Zhang #endif
1138afb2bd1cSJunchao Zhang 
11399566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
11409566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1141aa372e3fSPaul Mullowney                                   upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1142aa372e3fSPaul Mullowney                                   upTriFactor->csrMat->values->data().get(),
1143aa372e3fSPaul Mullowney                                   upTriFactor->csrMat->row_offsets->data().get(),
1144aa372e3fSPaul Mullowney                                   upTriFactor->csrMat->column_indices->data().get(),
1145aa372e3fSPaul Mullowney                                   upTriFactorT->csrMat->values->data().get(),
1146afb2bd1cSJunchao Zhang                                   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1147afb2bd1cSJunchao Zhang                                   upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1148afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC, indexBase,
11495f80ce2aSJacob Faibussowitsch                                   CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer));
1150afb2bd1cSJunchao Zhang                                   #else
1151afb2bd1cSJunchao Zhang                                   upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
11525f80ce2aSJacob Faibussowitsch                                  CUSPARSE_ACTION_NUMERIC, indexBase));
1153afb2bd1cSJunchao Zhang                                  #endif
1154d49cd2b7SBarry Smith 
11559566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11569566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1157aa372e3fSPaul Mullowney 
1158afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
11599566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
11609566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactorT->solveInfo));
11611b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
11629566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1163afb2bd1cSJunchao Zhang                                          upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1164afb2bd1cSJunchao Zhang                                          upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1165afb2bd1cSJunchao Zhang                                          upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
11665f80ce2aSJacob Faibussowitsch                                          &upTriFactorT->solveBufferSize));
11679566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize));
1168afb2bd1cSJunchao Zhang   #endif
1169afb2bd1cSJunchao Zhang 
1170afb2bd1cSJunchao Zhang   /* perform the solve analysis */
11715f80ce2aSJacob Faibussowitsch   /* christ, would it have killed you to put this stuff in a function????????? */
11729566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1173afb2bd1cSJunchao Zhang                                    upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1174afb2bd1cSJunchao Zhang                                    upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1175d49cd2b7SBarry Smith                                    upTriFactorT->csrMat->column_indices->data().get(),
11761b0a6780SStefano Zampini                                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1177d49cd2b7SBarry Smith                                    upTriFactorT->solveInfo,
11785f80ce2aSJacob Faibussowitsch                                    upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));
1179d49cd2b7SBarry Smith                                    #else
11805f80ce2aSJacob Faibussowitsch                                    upTriFactorT->solveInfo));
1181afb2bd1cSJunchao Zhang                                    #endif
1182d49cd2b7SBarry Smith 
11839566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
11849566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
1185aa372e3fSPaul Mullowney 
1186da79fbbcSStefano Zampini   /* assign the pointer */
1187aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1188bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1189bda325fcSPaul Mullowney }
1190bda325fcSPaul Mullowney 
1191a49f1ed0SStefano Zampini struct PetscScalarToPetscInt
1192a49f1ed0SStefano Zampini {
1193a49f1ed0SStefano Zampini   __host__ __device__
1194a49f1ed0SStefano Zampini   PetscInt operator()(PetscScalar s)
1195a49f1ed0SStefano Zampini   {
1196a49f1ed0SStefano Zampini     return (PetscInt)PetscRealPart(s);
1197a49f1ed0SStefano Zampini   }
1198a49f1ed0SStefano Zampini };
1199a49f1ed0SStefano Zampini 
12003606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1201bda325fcSPaul Mullowney {
1202aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1203a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1204bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1205bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1206aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1207b175d8bbSPaul Mullowney 
1208bda325fcSPaul Mullowney   PetscFunctionBegin;
12099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1210a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
121128b400f6SJacob Faibussowitsch   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1212a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1213*08401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
12141a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
12159566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
12169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1217a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
12189566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1219a49f1ed0SStefano Zampini   }
1220a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1221aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12229566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1223aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12249566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12259566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1226aa372e3fSPaul Mullowney 
1227b06137fdSPaul Mullowney     /* set alpha and beta */
12289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
12299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
12309566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
12319566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
12329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
12339566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1234b06137fdSPaul Mullowney 
1235aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1236aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1237a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1238554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1239554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1240aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1241a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1242aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1243aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1244a3fdcf43SKarl Rupp 
1245039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
124681902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1247afb2bd1cSJunchao Zhang 
1248afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
12493606e59fSJunchao Zhang       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1250afb2bd1cSJunchao Zhang         stat = cusparseCreateCsr(&matstructT->matDescr,
1251afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1252afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1253afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1254afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12559566063dSJacob Faibussowitsch                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
12563606e59fSJunchao Zhang       #else
12573606e59fSJunchao Zhang         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12583606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12593606e59fSJunchao Zhang 
12603606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12613606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12623606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12633606e59fSJunchao Zhang         */
12643606e59fSJunchao Zhang         if (matrixT->num_entries) {
12653606e59fSJunchao Zhang           stat = cusparseCreateCsr(&matstructT->matDescr,
12663606e59fSJunchao Zhang                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
12673606e59fSJunchao Zhang                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
12683606e59fSJunchao Zhang                                  matrixT->values->data().get(),
12693606e59fSJunchao Zhang                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
12709566063dSJacob Faibussowitsch                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
12713606e59fSJunchao Zhang 
12723606e59fSJunchao Zhang         } else {
12733606e59fSJunchao Zhang           matstructT->matDescr = NULL;
12743606e59fSJunchao Zhang           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
12753606e59fSJunchao Zhang         }
12763606e59fSJunchao Zhang       #endif
1277afb2bd1cSJunchao Zhang      #endif
1278aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1279afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1280afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1281afb2bd1cSJunchao Zhang    #else
1282aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
128351c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
128451c6d536SStefano Zampini       /* First convert HYB to CSR */
1285aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1286aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1287aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1288aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1289aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1290aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1291aa372e3fSPaul Mullowney 
1292aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1293aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1294aa372e3fSPaul Mullowney                               temp->values->data().get(),
1295aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
12969566063dSJacob Faibussowitsch                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1297aa372e3fSPaul Mullowney 
1298aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1299aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1300aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1301aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1302aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1303aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1304aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1305aa372e3fSPaul Mullowney 
1306aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1307aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1308aa372e3fSPaul Mullowney                               temp->values->data().get(),
1309aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1310aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1311aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1312aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1313aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
13149566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1315aa372e3fSPaul Mullowney 
1316aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1317aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13189566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1319aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1320aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1321aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1322aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1323aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1324aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
13259566063dSJacob Faibussowitsch                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1326aa372e3fSPaul Mullowney 
1327aa372e3fSPaul Mullowney       /* assign the pointer */
1328aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13291a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1330aa372e3fSPaul Mullowney       /* delete temporaries */
1331aa372e3fSPaul Mullowney       if (tempT) {
1332aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1333aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1334aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1335aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1336087f3262SPaul Mullowney       }
1337aa372e3fSPaul Mullowney       if (temp) {
1338aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1339aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1340aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1341aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1342aa372e3fSPaul Mullowney       }
1343afb2bd1cSJunchao Zhang      #endif
1344aa372e3fSPaul Mullowney     }
1345a49f1ed0SStefano Zampini   }
1346a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1347a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1348a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
134928b400f6SJacob Faibussowitsch     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
135028b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
135128b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
135228b400f6SJacob Faibussowitsch     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
135328b400f6SJacob Faibussowitsch     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
135428b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
135528b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
135628b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1357a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1358a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1359a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
13609566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1361a49f1ed0SStefano Zampini     }
1362a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1363a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1364a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1365a49f1ed0SStefano Zampini 
1366a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1367a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1368a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1369a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1370a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1371a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1372a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1373a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1374a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1375a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1376a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1377a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
13789566063dSJacob Faibussowitsch                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
13799566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1380a49f1ed0SStefano Zampini      #endif
1381a49f1ed0SStefano Zampini 
13821a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13831a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13841a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13851a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13861a2c6b5cSJunchao Zhang 
13871a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13881a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13891a2c6b5cSJunchao Zhang         */
13901a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
13911a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
13921a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
13931a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
13941a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1395a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1396a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1397a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1398a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
13999566063dSJacob Faibussowitsch                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1400a49f1ed0SStefano Zampini                              #else
1401a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
14029566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1403a49f1ed0SStefano Zampini                              #endif
14041a2c6b5cSJunchao Zhang       } else {
14051a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
14061a2c6b5cSJunchao Zhang       }
14071a2c6b5cSJunchao Zhang 
1408a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1409a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1410a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
14119566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1412a49f1ed0SStefano Zampini      #endif
1413a49f1ed0SStefano Zampini     }
1414a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1415a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1416a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1417a49f1ed0SStefano Zampini   }
14189566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14199566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1420213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1421213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1422aa372e3fSPaul Mullowney   /* assign the pointer */
1423aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
14241a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1425bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1426bda325fcSPaul Mullowney }
1427bda325fcSPaul Mullowney 
1428a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve on the GPU using the transposed triangular
  factors together with the row/column permutations stored alongside them.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors (cuSPARSE handle, transposed
       lower/upper factor structs, rpermIndices, cpermIndices, workVector, nnz)
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed through VecCUDAGetArrayWrite()

  Sequence of operations, in the order they are issued:
  1. if neither transposed factor struct is present, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called
     and the two pointers are re-read;
  2. bb is gathered through rpermIndices into xx;
  3. cusparse_solve() with the transposed upper factor, vector operands (xx, workVector);
  4. cusparse_solve() with the transposed lower factor, vector operands (workVector, xx);
  5. xx is gathered through cpermIndices into workVector, which is then copied back to xx.

  Notes:
  The cusparse_solve() argument list depends on the CUDA version (see the #if blocks inside the calls);
  the macro itself is defined outside this part of the file.
  The thrust::copy() calls are issued on PetscDefaultCudaStream and are not wrapped in an error-checking macro.
*/
14296fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1430bda325fcSPaul Mullowney {
1431c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1432465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1433465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1434465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1435465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1436bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1437bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1438aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1439aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1440aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1441bda325fcSPaul Mullowney 
1442bda325fcSPaul Mullowney   PetscFunctionBegin;
1443aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* The analysis is triggered only when BOTH transposed factor pointers are NULL; the local copies are refreshed afterwards */
1444aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
14459566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1446aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1447aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1448bda325fcSPaul Mullowney   }
1449bda325fcSPaul Mullowney 
1450bda325fcSPaul Mullowney   /* Get the GPU pointers */
  /* xGPU/bGPU wrap the same raw pointers as thrust::device_ptr so they can be used with thrust iterators below */
14519566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
14529566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1453c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1454c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1455bda325fcSPaul Mullowney 
14569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1457aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
  /* NOTE(review): the end iterator is built on bGPU+n while MatSolve_SeqAIJCUSPARSE uses bGPU for both ends;
     presumably only the index iterators (rpermIndices begin/end) determine the copied range -- confirm */
1458a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1459c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1460c41cb2e2SAlejandro Lamas Daviña                xGPU);
1461aa372e3fSPaul Mullowney 
1462aa372e3fSPaul Mullowney   /* First, solve U */
  /* Vector operands: xarray (holding the permuted right-hand side) followed by the work vector */
1463aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1464afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
14651b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1466afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1467afb2bd1cSJunchao Zhang                       #endif
1468afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1469aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1470aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1471aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1472aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1473d49cd2b7SBarry Smith                         xarray,
14741b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1475d49cd2b7SBarry Smith                         tempGPU->data().get(),
14769566063dSJacob Faibussowitsch                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1477d49cd2b7SBarry Smith                       #else
14789566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1479afb2bd1cSJunchao Zhang                       #endif
1480aa372e3fSPaul Mullowney 
1481aa372e3fSPaul Mullowney   /* Then, solve L */
  /* Vector operands: the work vector followed by xarray, so the result ends up back in xx */
1482aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1483afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14841b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1485afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1486afb2bd1cSJunchao Zhang                       #endif
1487afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1488aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1489aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1490aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1491aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1492d49cd2b7SBarry Smith                         tempGPU->data().get(),
14931b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1494d49cd2b7SBarry Smith                         xarray,
14959566063dSJacob Faibussowitsch                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1496d49cd2b7SBarry Smith                       #else
14979566063dSJacob Faibussowitsch                          xarray);PetscCallCUSPARSE(stat);
1498afb2bd1cSJunchao Zhang                       #endif
1499aa372e3fSPaul Mullowney 
1500aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1501a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1502c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1503aa372e3fSPaul Mullowney                tempGPU->begin());
1504aa372e3fSPaul Mullowney 
1505aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1506a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1507bda325fcSPaul Mullowney 
1508bda325fcSPaul Mullowney   /* restore */
  /* The arrays are restored before the GPU timer is stopped; logged flops are 2*nnz minus the number of columns of A */
15099566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
15109566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
15119566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15129566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1513bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1514bda325fcSPaul Mullowney }
1515bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve on the GPU using the transposed
  triangular factors, without applying any row or column permutation.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors (cuSPARSE handle, transposed
       lower/upper factor structs, workVector, nnz)
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed through VecCUDAGetArrayWrite()

  Sequence of operations, in the order they are issued:
  1. if neither transposed factor struct is present, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called
     and the two pointers are re-read;
  2. cusparse_solve() with the transposed upper factor, vector operands (bb, workVector);
  3. cusparse_solve() with the transposed lower factor, vector operands (workVector, xx).

  Notes:
  The cusparse_solve() argument list depends on the CUDA version (see the #if blocks inside the calls);
  the macro itself is defined outside this part of the file.
*/
15166fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1517bda325fcSPaul Mullowney {
1518465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1519465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1520bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1521bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1522aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1523aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1524aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1525bda325fcSPaul Mullowney 
1526bda325fcSPaul Mullowney   PetscFunctionBegin;
1527aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* The analysis is triggered only when BOTH transposed factor pointers are NULL; the local copies are refreshed afterwards */
1528aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15299566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1530aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1531aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1532bda325fcSPaul Mullowney   }
1533bda325fcSPaul Mullowney 
1534bda325fcSPaul Mullowney   /* Get the GPU pointers */
15359566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
15369566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1537bda325fcSPaul Mullowney 
15389566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1539aa372e3fSPaul Mullowney   /* First, solve U */
  /* Vector operands: barray (the right-hand side, used directly since no permutation is applied) followed by the work vector */
1540aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1541afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
15421b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1543afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1544afb2bd1cSJunchao Zhang                       #endif
1545afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1546aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1547aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1548aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1549aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1550d49cd2b7SBarry Smith                         barray,
15511b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1552d49cd2b7SBarry Smith                         tempGPU->data().get(),
15539566063dSJacob Faibussowitsch                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1554d49cd2b7SBarry Smith                       #else
15559566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1556afb2bd1cSJunchao Zhang                       #endif
1557aa372e3fSPaul Mullowney 
1558aa372e3fSPaul Mullowney   /* Then, solve L */
  /* Vector operands: the work vector followed by xarray, so the result is written to xx */
1559aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1560afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
15611b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1562afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1563afb2bd1cSJunchao Zhang                       #endif
1564afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1565aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1566aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1567aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1568aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1569d49cd2b7SBarry Smith                         tempGPU->data().get(),
15701b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1571d49cd2b7SBarry Smith                         xarray,
15729566063dSJacob Faibussowitsch                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1573d49cd2b7SBarry Smith                       #else
15749566063dSJacob Faibussowitsch                         xarray);PetscCallCUSPARSE(stat);
1575afb2bd1cSJunchao Zhang                       #endif
1576bda325fcSPaul Mullowney 
1577bda325fcSPaul Mullowney   /* restore */
  /* The arrays are restored before the GPU timer is stopped; logged flops are 2*nnz minus the number of columns of A */
15789566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
15799566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
15809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
15819566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1582bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1583bda325fcSPaul Mullowney }
1584bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Triangular solves with the factors stored in A->spptr, using
  the row and column permutations recorded there.

  Input Parameters:
+ A  - factored matrix; A->spptr is read as Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only on the device

  Output Parameter:
. xx - result, accessed write-only on the device

  Sequence performed below:
  1. gather bb through rpermIndices into the work vector,
  2. cusparse_solve() with the lower factor, passing the work vector and then xarray,
  3. cusparse_solve() with the upper factor, passing xarray and then the work vector,
  4. gather the work vector through cpermIndices into xx.
  Presumably the first vector argument of cusparse_solve() is the input and the second the
  output (this matches the data flow above) - confirm against its definition.

  With PETSC_PKG_CUDA_VERSION_GE(9,0,0) each cusparse_solve() call additionally receives
  num_entries, the solve policy and the solve buffer.

  2*nnz - A->cmap->n flops are logged as GPU flops.
*/
15856fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
15869ae82921SPaul Mullowney {
1587465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1588465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1589465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1590465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15919ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15929ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1593aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1594aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1595aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
15969ae82921SPaul Mullowney 
15979ae82921SPaul Mullowney   PetscFunctionBegin;
1598ebc8f436SDominic Meiser 
1599e057df02SPaul Mullowney   /* Get the GPU pointers */
16009566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
16019566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
  /* wrap the raw device pointers so they can be used as thrust iterators below */
1602c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1603c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16049ae82921SPaul Mullowney 
16059566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1606aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1607a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1608c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
16094e4bbfaaSStefano Zampini                tempGPU->begin());
1610aa372e3fSPaul Mullowney 
1611aa372e3fSPaul Mullowney   /* Next, solve L */
1612aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1613afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16141b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1615afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1616afb2bd1cSJunchao Zhang                       #endif
1617afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1618aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1619aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1620aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1621aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1622d49cd2b7SBarry Smith                         tempGPU->data().get(),
16231b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1624d49cd2b7SBarry Smith                          xarray,
16259566063dSJacob Faibussowitsch                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1626d49cd2b7SBarry Smith                       #else
16279566063dSJacob Faibussowitsch                          xarray);PetscCallCUSPARSE(stat);
1628afb2bd1cSJunchao Zhang                       #endif
1629aa372e3fSPaul Mullowney 
1630aa372e3fSPaul Mullowney   /* Then, solve U */
1631aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1632afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16331b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1634afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1635afb2bd1cSJunchao Zhang                       #endif
1636afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1637aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1638aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1639aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1640d49cd2b7SBarry Smith                         upTriFactor->solveInfo,xarray,
16411b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1642d49cd2b7SBarry Smith                         tempGPU->data().get(),
16439566063dSJacob Faibussowitsch                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1644d49cd2b7SBarry Smith                       #else
16459566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1646afb2bd1cSJunchao Zhang                       #endif
1647d49cd2b7SBarry Smith 
16484e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1649a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
16504e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
16514e4bbfaaSStefano Zampini                xGPU);
16529ae82921SPaul Mullowney 
16539566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
16549566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
16559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
16579ae82921SPaul Mullowney   PetscFunctionReturn(0);
16589ae82921SPaul Mullowney }
16599ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Triangular solves with the factors stored in
  A->spptr; unlike MatSolve_SeqAIJCUSPARSE() no permutation of bb or xx is applied here.

  Input Parameters:
+ A  - factored matrix; A->spptr is read as Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only on the device

  Output Parameter:
. xx - result, accessed write-only on the device

  Sequence performed below:
  1. cusparse_solve() with the lower factor, passing barray and then the work vector,
  2. cusparse_solve() with the upper factor, passing the work vector and then xarray.
  Presumably the first vector argument of cusparse_solve() is the input and the second the
  output - confirm against its definition.

  With PETSC_PKG_CUDA_VERSION_GE(9,0,0) each cusparse_solve() call additionally receives
  num_entries, the solve policy and the solve buffer.

  2*nnz - A->cmap->n flops are logged as GPU flops.
*/
16606fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
16619ae82921SPaul Mullowney {
1662465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1663465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
16649ae82921SPaul Mullowney   cusparseStatus_t                  stat;
16659ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1666aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1667aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1668aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
16699ae82921SPaul Mullowney 
16709ae82921SPaul Mullowney   PetscFunctionBegin;
1671e057df02SPaul Mullowney   /* Get the GPU pointers */
16729566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
16739566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
16749ae82921SPaul Mullowney 
16759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1676aa372e3fSPaul Mullowney   /* First, solve L */
1677aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1678afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16791b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1680afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1681afb2bd1cSJunchao Zhang                       #endif
1682afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1683aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1684aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1685aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1686aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1687d49cd2b7SBarry Smith                         barray,
16881b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1689d49cd2b7SBarry Smith                         tempGPU->data().get(),
16909566063dSJacob Faibussowitsch                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1691d49cd2b7SBarry Smith                       #else
16929566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1693afb2bd1cSJunchao Zhang                       #endif
1694d49cd2b7SBarry Smith 
1695aa372e3fSPaul Mullowney   /* Next, solve U */
1696aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1697afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16981b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1699afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1700afb2bd1cSJunchao Zhang                       #endif
1701afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1702aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1703aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1704aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1705aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1706d49cd2b7SBarry Smith                         tempGPU->data().get(),
17071b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1708d49cd2b7SBarry Smith                         xarray,
17099566063dSJacob Faibussowitsch                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1710d49cd2b7SBarry Smith                       #else
17119566063dSJacob Faibussowitsch                         xarray);PetscCallCUSPARSE(stat);
1712afb2bd1cSJunchao Zhang                       #endif
17139ae82921SPaul Mullowney 
17149566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
17159566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
17169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
17179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
17189ae82921SPaul Mullowney   PetscFunctionReturn(0);
17199ae82921SPaul Mullowney }
17209ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Refreshes the host values array from the device copy.

  Nothing happens unless A->offloadmask is PETSC_OFFLOAD_GPU. In that case a->nz scalars
  are copied from the device CSR values into a->a and the mask becomes PETSC_OFFLOAD_BOTH.
  Only the values are transferred; the index arrays are not touched here.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  PetscFunctionBegin;
  /* host data is already current (or there is nothing on the device to fetch) */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpumat = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr    = (CsrMatrix*)gpumat->mat->mat;
  const size_t       nbytes  = aij->nz*sizeof(PetscScalar);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
  PetscCallCUDA(cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuToCpu(nbytes));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));

  /* both copies now hold the same values */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
17397e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array for read/write access.

  The host copy is first synchronized through MatSeqAIJCUSPARSECopyFromGPU().
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
174767a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - Ends read/write access to the host values array.

  The caller's pointer is cleared and A->offloadmask is set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
175567a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - Returns the host values array for read-only access.

  The host copy is first synchronized through MatSeqAIJCUSPARSECopyFromGPU().
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
176367a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - Ends read-only access to the host values array.

  Only the caller's pointer is cleared; A is not modified.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
177067a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - Returns the host values array for write-only access.

  No device-to-host copy is made before the pointer is handed out.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
177767a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - Ends write-only access to the host values array.

  The caller's pointer is cleared and A->offloadmask is set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
17857e8381f9SStefano Zampini 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - Hands out the device CSR arrays of an unfactored matrix.

  Input Parameter:
. A - the matrix; must have factortype MAT_FACTOR_NONE and must not use the ELL/HYB storage format

  Output Parameters (each may be NULL to skip it):
+ i     - device row offsets
. j     - device column indices
. a     - device values
- mtype - set to PETSC_MEMTYPE_CUDA

  Notes:
  The state checks run before MatSeqAIJCUSPARSECopyToGPU() because that routine already
  interprets A->spptr as a Mat_SeqAIJCUSPARSE and dereferences it; checking afterwards
  would be too late for a factored matrix or a NULL spptr.

  For the ELL/HYB formats the pointer stored in cusp->mat->mat is a cusparseHybMat_t and not
  a CsrMatrix, so those formats are rejected instead of being reinterpreted.

  Requesting i or j in a PETSC_USE_64BIT_INDICES build raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* validate the matrix state before anything touches A->spptr */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");

  /* make sure the device copy exists and is current */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
18167ee59b9bSJunchao Zhang 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of A current.

  Errors with PETSC_ERR_GPU when A->boundtocpu is set. Work is done only when
  A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU; otherwise the routine
  returns without touching the device data.

  Two paths:
  - Values only: taken when A->nonzerostate equals the state recorded in the cusparse
    struct and the format is MAT_CUSPARSE_CSR. The a->nz host values are assigned to the
    existing device values array and the transpose is invalidated with PETSC_FALSE.
  - Full rebuild: the existing mult struct, workVector and rowoffsets_gpu are destroyed,
    the transpose is invalidated with PETSC_TRUE, and a new Mat_SeqAIJCUSPARSEMultStruct is
    created. It receives a matrix descriptor, three device scalars (alpha_one, beta_zero,
    beta_one) and either a CsrMatrix (CSR format) or, for CUDA before 11.0, a hybrid matrix
    converted from a temporary CsrMatrix (ELL/HYB formats). When a->compressedrow.use is set
    the compressed row data is uploaded and the compressed row indices are stored in
    cprowIndices. Finally the current A->nonzerostate is recorded.

  A->offloadmask becomes PETSC_OFFLOAD_BOTH unless the rebuild path found a->a to be NULL,
  in which case no values were uploaded and the mask is left unchanged.
*/
1817042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
18189ae82921SPaul Mullowney {
1819aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
18207c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
18219ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1822213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
1823aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1824abb89eb1SStefano Zampini   PetscBool                    both = PETSC_TRUE;
18259ae82921SPaul Mullowney 
18269ae82921SPaul Mullowney   PetscFunctionBegin;
182728b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1828c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1829a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1830a49f1ed0SStefano Zampini       CsrMatrix *matrix;
1831afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
183285ba7357SStefano Zampini 
1833*08401ef6SPierre Jolivet       PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
18349566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
1835afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
18369566063dSJacob Faibussowitsch       PetscCallCUDA(WaitForCUDA());
18379566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
18389566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
18399566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
184034d6c7a5SJose E. Roman     } else {
1841abb89eb1SStefano Zampini       PetscInt nnz;
18429566063dSJacob Faibussowitsch       PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
18439566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
18449566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
18457c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
184681902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
1847a49f1ed0SStefano Zampini       cusparsestruct->workVector = NULL;
1848a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
18499ae82921SPaul Mullowney       try {
        /* pick the row structure to upload: compressed rows if enabled, full CSR otherwise */
18509ae82921SPaul Mullowney         if (a->compressedrow.use) {
18519ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
18529ae82921SPaul Mullowney           ii   = a->compressedrow.i;
18539ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
18549ae82921SPaul Mullowney         } else {
1855213423ffSJunchao Zhang           m    = A->rmap->n;
1856213423ffSJunchao Zhang           ii   = a->i;
1857e6e9a74fSStefano Zampini           ridx = NULL;
18589ae82921SPaul Mullowney         }
1859*08401ef6SPierre Jolivet         PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* without host values the entry count comes from the row offsets and the mask is not promoted to BOTH */
1860abb89eb1SStefano Zampini         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1861abb89eb1SStefano Zampini         else nnz = a->nz;
1862*08401ef6SPierre Jolivet         PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
18639ae82921SPaul Mullowney 
186485ba7357SStefano Zampini         /* create cusparse matrix */
1865abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
1866aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
18679566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
18689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
18699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
18709ae82921SPaul Mullowney 
        /* device-resident scalar constants; the handle is switched to device pointer mode below */
18719566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
18729566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
18739566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
18749566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
18759566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
18769566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
18779566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
1878b06137fdSPaul Mullowney 
1879aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1880aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1881aa372e3fSPaul Mullowney           /* set the matrix */
1882afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1883afb2bd1cSJunchao Zhang           mat->num_rows = m;
1884afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1885abb89eb1SStefano Zampini           mat->num_entries = nnz;
1886afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1887afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
18889ae82921SPaul Mullowney 
1889abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1890abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1891aa372e3fSPaul Mullowney 
1892abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1893abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1894aa372e3fSPaul Mullowney 
1895aa372e3fSPaul Mullowney           /* assign the pointer */
1896afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1897afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1898afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1899afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1900afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1901afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1902afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1903afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
19049566063dSJacob Faibussowitsch                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
1905afb2bd1cSJunchao Zhang           }
1906afb2bd1cSJunchao Zhang          #endif
1907aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1908afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1909afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1910afb2bd1cSJunchao Zhang          #else
          /* temporary CSR copy on the device, converted to hybrid storage and then freed */
1911afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1912afb2bd1cSJunchao Zhang           mat->num_rows = m;
1913afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1914abb89eb1SStefano Zampini           mat->num_entries = nnz;
1915afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1916afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1917aa372e3fSPaul Mullowney 
1918abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1919abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1920aa372e3fSPaul Mullowney 
1921abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1922abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1923aa372e3fSPaul Mullowney 
1924aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
19259566063dSJacob Faibussowitsch           PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1926aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1927aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1928afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1929afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1930afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1931afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
19329566063dSJacob Faibussowitsch               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1933aa372e3fSPaul Mullowney           /* assign the pointer */
1934aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1935aa372e3fSPaul Mullowney 
1936afb2bd1cSJunchao Zhang           if (mat) {
1937afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1938afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1939afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1940afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1941087f3262SPaul Mullowney           }
1942afb2bd1cSJunchao Zhang          #endif
1943087f3262SPaul Mullowney         }
1944ca45077fSPaul Mullowney 
1945aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1946213423ffSJunchao Zhang         if (a->compressedrow.use) {
1947213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1948aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1949aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1950213423ffSJunchao Zhang           tmp = m;
1951213423ffSJunchao Zhang         } else {
1952213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1953213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1954213423ffSJunchao Zhang           tmp = 0;
1955213423ffSJunchao Zhang         }
        /* NOTE(review): the logged byte count uses a->nz, while nnz entries are uploaded and the values
           are skipped when a->a is NULL - confirm this accounting is intended */
19569566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar)));
1957aa372e3fSPaul Mullowney 
1958aa372e3fSPaul Mullowney         /* assign the pointer */
1959aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
      /* NOTE(review): only exceptions of type char* are handled by this catch clause */
19609ae82921SPaul Mullowney       } catch(char *ex) {
196198921bdaSJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
19629ae82921SPaul Mullowney       }
19639566063dSJacob Faibussowitsch       PetscCallCUDA(WaitForCUDA());
19649566063dSJacob Faibussowitsch       PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
196534d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
196634d6c7a5SJose E. Roman     }
1967abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
19689ae82921SPaul Mullowney   }
19699ae82921SPaul Mullowney   PetscFunctionReturn(0);
19709ae82921SPaul Mullowney }
19719ae82921SPaul Mullowney 
/* Functor over a two-element tuple: adds element 0 to element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
1981aa372e3fSPaul Mullowney 
/* Functor over a two-element tuple: copies element 0 into element 1 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
19917e8381f9SStefano Zampini 
/* Functor over a two-element tuple: copies element 1 into element 0 */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
2001e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - Data attached to a matrix product (read from product->data by the
  product routines in this file) and released by MatDestroy_MatMatCusparse().

  Ownership as implemented by MatDestroy_MatMatCusparse(): Bt, dBuffer4, dBuffer5, mmBuffer
  and mmBuffer2 are released with cudaFree(); Bcsr with delete; X with MatDestroy(); the
  cusparse descriptors with their respective destroy routines.

  The descriptor and buffer members exist only when PETSC_PKG_CUDA_VERSION_GE(11,0,0);
  dBuffer4 and dBuffer5 additionally require PETSC_PKG_CUDA_VERSION_GE(11,4,0).
*/
2002afb2bd1cSJunchao Zhang struct MatMatCusparse {
2003ccdfe979SStefano Zampini   PetscBool             cisdense;
2004ccdfe979SStefano Zampini   PetscScalar           *Bt;
2005ccdfe979SStefano Zampini   Mat                   X;
2006fcdce8c4SStefano Zampini   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2007fcdce8c4SStefano Zampini   PetscLogDouble        flops;
2008fcdce8c4SStefano Zampini   CsrMatrix             *Bcsr;
2009b4285af6SJunchao Zhang 
2010afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2011fcdce8c4SStefano Zampini   cusparseSpMatDescr_t  matSpBDescr;
2012afb2bd1cSJunchao Zhang   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
2013afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matBDescr;
2014afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matCDescr;
2015afb2bd1cSJunchao Zhang   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
2016b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2017b4285af6SJunchao Zhang   void                  *dBuffer4;
2018b4285af6SJunchao Zhang   void                  *dBuffer5;
2019b4285af6SJunchao Zhang  #endif
2020fcdce8c4SStefano Zampini   size_t                mmBufferSize;
2021fcdce8c4SStefano Zampini   void                  *mmBuffer;
2022fcdce8c4SStefano Zampini   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2023fcdce8c4SStefano Zampini   cusparseSpGEMMDescr_t spgemmDesc;
2024afb2bd1cSJunchao Zhang #endif
2025afb2bd1cSJunchao Zhang };
2026ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything held by a MatMatCusparse and the struct itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = static_cast<MatMatCusparse*>(data);

  PetscFunctionBegin;
  /* device buffer Bt and the Bcsr object */
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cusparse descriptors, destroyed only when they were created */
  if (mm->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  if (mm->matBDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  if (mm->matCDescr)   PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  if (mm->spgemmDesc)  PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)    PetscCallCUDA(cudaFree(mm->dBuffer4));
  if (mm->dBuffer5)    PetscCallCUDA(cudaFree(mm->dBuffer5));
 #endif
  /* device work buffers */
  if (mm->mmBuffer)    PetscCallCUDA(cudaFree(mm->mmBuffer));
  if (mm->mmBuffer2)   PetscCallCUDA(cudaFree(mm->mmBuffer2));
 #endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2050ccdfe979SStefano Zampini 
/* Forward declaration; the definition is not part of this section of the file.
   NOTE(review): the meaning of the two PetscBool arguments cannot be told from here - see the definition. */
2051ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2052ccdfe979SStefano Zampini 
2053ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2054ccdfe979SStefano Zampini {
2055ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2056ccdfe979SStefano Zampini   Mat                          A,B;
2057afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2058ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2059ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2060ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2061ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2062ccdfe979SStefano Zampini   const PetscScalar            *barray;
2063ccdfe979SStefano Zampini   PetscScalar                  *carray;
2064ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2065ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2066ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2067ccdfe979SStefano Zampini 
2068ccdfe979SStefano Zampini   PetscFunctionBegin;
2069ccdfe979SStefano Zampini   MatCheckProduct(C,1);
207028b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2071ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2072ccdfe979SStefano Zampini   A    = product->A;
2073ccdfe979SStefano Zampini   B    = product->B;
20749566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
207528b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2076ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2077ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
207828b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
20799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2080ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2081ccdfe979SStefano Zampini   switch (product->type) {
2082ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2083ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2084ccdfe979SStefano Zampini     mat = cusp->mat;
2085ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2086ccdfe979SStefano Zampini     m   = A->rmap->n;
2087ccdfe979SStefano Zampini     n   = B->cmap->n;
2088ccdfe979SStefano Zampini     break;
2089ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20901a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2091e6e9a74fSStefano Zampini       mat = cusp->mat;
2092e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2093e6e9a74fSStefano Zampini     } else {
20949566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2095ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2096ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2097e6e9a74fSStefano Zampini     }
2098ccdfe979SStefano Zampini     m = A->cmap->n;
2099ccdfe979SStefano Zampini     n = B->cmap->n;
2100ccdfe979SStefano Zampini     break;
2101ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2102ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2103ccdfe979SStefano Zampini     mat = cusp->mat;
2104ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2105ccdfe979SStefano Zampini     m   = A->rmap->n;
2106ccdfe979SStefano Zampini     n   = B->rmap->n;
2107ccdfe979SStefano Zampini     break;
2108ccdfe979SStefano Zampini   default:
210998921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2110ccdfe979SStefano Zampini   }
211128b400f6SJacob Faibussowitsch   PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2112ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2113ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
21149566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
21159566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
21169566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDAGetArrayRead(B,&barray));
2117afb2bd1cSJunchao Zhang 
21189566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B,&blda));
2119c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
21209566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
21219566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X,&clda));
2122c8378d12SStefano Zampini   } else {
21239566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
21249566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C,&clda));
2125c8378d12SStefano Zampini   }
2126c8378d12SStefano Zampini 
21279566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2128afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2129afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2130a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2131afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2132fcdce8c4SStefano Zampini     size_t mmBufferSize;
21339566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
2134afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
21359566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2136afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2137afb2bd1cSJunchao Zhang     }
2138c8378d12SStefano Zampini 
21399566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
2140afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
21419566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2142afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2143afb2bd1cSJunchao Zhang     }
2144afb2bd1cSJunchao Zhang 
2145afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2146afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2147afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2148afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2149afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2150afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
21519566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2152afb2bd1cSJunchao Zhang     }
2153afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2154afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2155afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
21569566063dSJacob Faibussowitsch                                    cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
2157fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
21589566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
21599566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
2160fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2161fcdce8c4SStefano Zampini     }
2162afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2163afb2bd1cSJunchao Zhang   } else {
2164afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
21659566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
21669566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
21679566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
2168afb2bd1cSJunchao Zhang   }
2169afb2bd1cSJunchao Zhang 
2170afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2171afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2172afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2173afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
21749566063dSJacob Faibussowitsch                       cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2175afb2bd1cSJunchao Zhang  #else
2176afb2bd1cSJunchao Zhang   PetscInt k;
2177afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2178ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2179ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2180ccdfe979SStefano Zampini     cublasStatus_t cerr;
2181ccdfe979SStefano Zampini 
21829566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2183ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2184ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2185ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2186ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
21879566063dSJacob Faibussowitsch                        mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
2188ccdfe979SStefano Zampini     blda = B->cmap->n;
2189afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2190afb2bd1cSJunchao Zhang   } else {
2191afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2192ccdfe979SStefano Zampini   }
2193ccdfe979SStefano Zampini 
2194afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2195ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2196afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2197ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2198ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2199ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2200ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
22019566063dSJacob Faibussowitsch                            carray,clda);PetscCallCUSPARSE(stat);
2202afb2bd1cSJunchao Zhang  #endif
22039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
22049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
22059566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
2206ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
22079566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
22089566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
2209ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
22109566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
22119566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
2212ccdfe979SStefano Zampini   } else {
22139566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
2214ccdfe979SStefano Zampini   }
2215ccdfe979SStefano Zampini   if (mmdata->cisdense) {
22169566063dSJacob Faibussowitsch     PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
2217ccdfe979SStefano Zampini   }
2218ccdfe979SStefano Zampini   if (!biscuda) {
22199566063dSJacob Faibussowitsch     PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
2220ccdfe979SStefano Zampini   }
2221ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2222ccdfe979SStefano Zampini }
2223ccdfe979SStefano Zampini 
2224ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2225ccdfe979SStefano Zampini {
2226ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2227ccdfe979SStefano Zampini   Mat                A,B;
2228ccdfe979SStefano Zampini   PetscInt           m,n;
2229ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2230ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2231ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2232ccdfe979SStefano Zampini 
2233ccdfe979SStefano Zampini   PetscFunctionBegin;
2234ccdfe979SStefano Zampini   MatCheckProduct(C,1);
223528b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2236ccdfe979SStefano Zampini   A    = product->A;
2237ccdfe979SStefano Zampini   B    = product->B;
22389566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
223928b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2240ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2241*08401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2242ccdfe979SStefano Zampini   switch (product->type) {
2243ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2244ccdfe979SStefano Zampini     m = A->rmap->n;
2245ccdfe979SStefano Zampini     n = B->cmap->n;
2246ccdfe979SStefano Zampini     break;
2247ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2248ccdfe979SStefano Zampini     m = A->cmap->n;
2249ccdfe979SStefano Zampini     n = B->cmap->n;
2250ccdfe979SStefano Zampini     break;
2251ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2252ccdfe979SStefano Zampini     m = A->rmap->n;
2253ccdfe979SStefano Zampini     n = B->rmap->n;
2254ccdfe979SStefano Zampini     break;
2255ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2256ccdfe979SStefano Zampini     m = B->cmap->n;
2257ccdfe979SStefano Zampini     n = B->cmap->n;
2258ccdfe979SStefano Zampini     break;
2259ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2260ccdfe979SStefano Zampini     m = B->rmap->n;
2261ccdfe979SStefano Zampini     n = B->rmap->n;
2262ccdfe979SStefano Zampini     break;
2263ccdfe979SStefano Zampini   default:
226498921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2265ccdfe979SStefano Zampini   }
22669566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
2267ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
22689566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
22699566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQDENSECUDA));
2270ccdfe979SStefano Zampini 
2271ccdfe979SStefano Zampini   /* product data */
22729566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2273ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2274afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2275afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2276ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
22779566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
2278ccdfe979SStefano Zampini   }
2279afb2bd1cSJunchao Zhang  #endif
2280ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2281ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
22829566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
22839566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
2284ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
22859566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
2286ccdfe979SStefano Zampini     } else {
22879566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
2288ccdfe979SStefano Zampini     }
2289ccdfe979SStefano Zampini   }
2290ccdfe979SStefano Zampini   C->product->data    = mmdata;
2291ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2292ccdfe979SStefano Zampini 
2293ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2294ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2295ccdfe979SStefano Zampini }
2296ccdfe979SStefano Zampini 
2297fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2298ccdfe979SStefano Zampini {
2299ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2300fcdce8c4SStefano Zampini   Mat                          A,B;
2301fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2302fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2303fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2304fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2305fcdce8c4SStefano Zampini   PetscBool                    flg;
2306fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2307fcdce8c4SStefano Zampini   MatProductType               ptype;
2308fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2309fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2310fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2311fcdce8c4SStefano Zampini #endif
2312b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2313ccdfe979SStefano Zampini 
2314ccdfe979SStefano Zampini   PetscFunctionBegin;
2315ccdfe979SStefano Zampini   MatCheckProduct(C,1);
231628b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
23179566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
231828b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2319fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2320fcdce8c4SStefano Zampini   A = product->A;
2321fcdce8c4SStefano Zampini   B = product->B;
2322fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2323fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2324fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2325*08401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2326fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
232728b400f6SJacob Faibussowitsch     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2328fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
232928b400f6SJacob Faibussowitsch     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2330fcdce8c4SStefano Zampini     goto finalize;
2331fcdce8c4SStefano Zampini   }
2332fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
23339566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
233428b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
23359566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
233628b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
233728b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
233828b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2339fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2340fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2341fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2342*08401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2343*08401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2344*08401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
23459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
23469566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2347fcdce8c4SStefano Zampini 
2348fcdce8c4SStefano Zampini   ptype = product->type;
2349fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2350fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
235128b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2352fa046f9fSJunchao Zhang   }
2353fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2354fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
235528b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2356fa046f9fSJunchao Zhang   }
2357fcdce8c4SStefano Zampini   switch (ptype) {
2358fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2359fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2360fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2361fcdce8c4SStefano Zampini     break;
2362fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2363fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2364fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2365fcdce8c4SStefano Zampini     break;
2366fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2367fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2368fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2369fcdce8c4SStefano Zampini     break;
2370fcdce8c4SStefano Zampini   default:
237198921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2372fcdce8c4SStefano Zampini   }
2373fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
237428b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
237528b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
237628b400f6SJacob Faibussowitsch   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2377fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2378fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2379fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
238028b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
238128b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
238228b400f6SJacob Faibussowitsch   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
23839566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2384fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2385fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
23869566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2387b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2388b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2389b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2390b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
23919566063dSJacob Faibussowitsch                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2392b4285af6SJunchao Zhang   #else
2393b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2394fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2395fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
23969566063dSJacob Faibussowitsch                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2397b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2398fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
23999566063dSJacob Faibussowitsch                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2400b4285af6SJunchao Zhang   #endif
2401fcdce8c4SStefano Zampini #else
2402b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2403fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2404fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2405fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
24069566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2407fcdce8c4SStefano Zampini #endif
24089566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
24099566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
24109566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2411fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2412fcdce8c4SStefano Zampini finalize:
2413fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
24149566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
24159566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
24169566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2417fcdce8c4SStefano Zampini   c->reallocs         = 0;
2418fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2419fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2420fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2421fcdce8c4SStefano Zampini   C->num_ass++;
2422ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2423ccdfe979SStefano Zampini }
2424fcdce8c4SStefano Zampini 
2425fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2426fcdce8c4SStefano Zampini {
2427fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2428fcdce8c4SStefano Zampini   Mat                          A,B;
2429fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2430fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2431fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2432fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2433fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2434fcdce8c4SStefano Zampini   PetscBool                    flg;
2435fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2436fcdce8c4SStefano Zampini   MatProductType               ptype;
2437fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2438fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2439fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2440fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2441fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2442fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2443fcdce8c4SStefano Zampini #else
2444fcdce8c4SStefano Zampini   int                          cnz;
2445fcdce8c4SStefano Zampini #endif
2446b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2447fcdce8c4SStefano Zampini 
2448fcdce8c4SStefano Zampini   PetscFunctionBegin;
2449fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
245028b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2451fcdce8c4SStefano Zampini   A    = product->A;
2452fcdce8c4SStefano Zampini   B    = product->B;
24539566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
245428b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
24559566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
245628b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2457fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2458fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2459fcdce8c4SStefano Zampini   /* product data */
24609566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2461fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2462fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2463fcdce8c4SStefano Zampini 
24649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
24659566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2466d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2467d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2468*08401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2469*08401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2470d60bce21SJunchao Zhang 
2471fcdce8c4SStefano Zampini   ptype = product->type;
2472fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2473fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2474fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2475fa046f9fSJunchao Zhang   }
2476fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2477fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2478fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2479fa046f9fSJunchao Zhang   }
2480fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2481fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2482fcdce8c4SStefano Zampini   switch (ptype) {
2483fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2484fcdce8c4SStefano Zampini     m = A->rmap->n;
2485fcdce8c4SStefano Zampini     n = B->cmap->n;
2486fcdce8c4SStefano Zampini     k = A->cmap->n;
2487fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2488fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2489fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2490fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2491fcdce8c4SStefano Zampini     break;
2492fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2493fcdce8c4SStefano Zampini     m = A->cmap->n;
2494fcdce8c4SStefano Zampini     n = B->cmap->n;
2495fcdce8c4SStefano Zampini     k = A->rmap->n;
24969566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2497fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2498fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2499fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2500fcdce8c4SStefano Zampini     break;
2501fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2502fcdce8c4SStefano Zampini     m = A->rmap->n;
2503fcdce8c4SStefano Zampini     n = B->rmap->n;
2504fcdce8c4SStefano Zampini     k = A->cmap->n;
25059566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2506fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2507fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2508fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2509fcdce8c4SStefano Zampini     break;
2510fcdce8c4SStefano Zampini   default:
251198921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2512fcdce8c4SStefano Zampini   }
2513fcdce8c4SStefano Zampini 
2514fcdce8c4SStefano Zampini   /* create cusparse matrix */
25159566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
25169566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
2517fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2518fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2519fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2520fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2521fcdce8c4SStefano Zampini 
2522fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2523fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row format, then c will be in compressed row format */
2524fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
25259566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
25269566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
2527fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2528fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2529fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2530fcdce8c4SStefano Zampini   } else {
2531fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2532fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2533fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2534fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2535fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2536fcdce8c4SStefano Zampini   }
2537fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2538fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2539fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2540fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2541fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2542fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
25439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
25449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
25459566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
25469566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
25479566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
25489566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
25499566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
25509566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
25519566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2552fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2553fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2554fcdce8c4SStefano Zampini     c->nz = 0;
2555fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2556fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2557fcdce8c4SStefano Zampini     goto finalizesym;
2558fcdce8c4SStefano Zampini   }
2559fcdce8c4SStefano Zampini 
256028b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
256128b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2562fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2563fcdce8c4SStefano Zampini   if (!biscompressed) {
2564fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2565fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2566fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2567fcdce8c4SStefano Zampini #endif
2568fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2569fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2570fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2571fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2572fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2573fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2574fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2575fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2576fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2577fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2578fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
25799566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
2580fcdce8c4SStefano Zampini     }
2581fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2582fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2583fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2584fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2585fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2586fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2587fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2588fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
25899566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
2590fcdce8c4SStefano Zampini     }
2591fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2592fcdce8c4SStefano Zampini #endif
2593fcdce8c4SStefano Zampini   }
259428b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
259528b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2596fcdce8c4SStefano Zampini   /* precompute flops count */
2597fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2598fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2599fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2600fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2601fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2602fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2603fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2604fcdce8c4SStefano Zampini       }
2605fcdce8c4SStefano Zampini     }
2606fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2607fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2608fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2609fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2610fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2611fcdce8c4SStefano Zampini     }
2612fcdce8c4SStefano Zampini   } else { /* TODO */
2613fcdce8c4SStefano Zampini     flops = 0.;
2614fcdce8c4SStefano Zampini   }
2615fcdce8c4SStefano Zampini 
2616fcdce8c4SStefano Zampini   mmdata->flops = flops;
26179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2618b4285af6SJunchao Zhang 
2619fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
26209566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2621fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2622fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2623fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
26249566063dSJacob Faibussowitsch                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
26259566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2626b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2627b4285af6SJunchao Zhang  {
2628b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2629b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2630b4285af6SJunchao Zhang   */
2631b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2632b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2633b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2634b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2635b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2636b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2637b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2638b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2639b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2640b4285af6SJunchao Zhang 
2641b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2642b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2643b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2644b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26459566063dSJacob Faibussowitsch                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
26469566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
2647b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2648b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2649b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26509566063dSJacob Faibussowitsch                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
2651b4285af6SJunchao Zhang 
2652b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2653b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2654b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26559566063dSJacob Faibussowitsch                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
26569566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
26579566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
26589566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
2659b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2660b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26619566063dSJacob Faibussowitsch                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
26629566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer1));
26639566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer2));
2664b4285af6SJunchao Zhang 
2665b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2666b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
26679566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2668b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2669b4285af6SJunchao Zhang   /* allocate matrix C */
26709566063dSJacob Faibussowitsch   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
26719566063dSJacob Faibussowitsch   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2672b4285af6SJunchao Zhang   /* update matC with the new pointers */
2673b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
26749566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2675b4285af6SJunchao Zhang 
2676b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2677b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2678b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26799566063dSJacob Faibussowitsch                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
26809566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
2681b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2682b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26839566063dSJacob Faibussowitsch                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
26849566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer3));
2685b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2686b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2687b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
26889566063dSJacob Faibussowitsch                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
26899566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
2690b4285af6SJunchao Zhang  }
2691ae37ee31SJunchao Zhang  #else
2692b4285af6SJunchao Zhang   size_t bufSize2;
2693fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2694b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2695fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2696fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
26979566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
26989566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
2699fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2700b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2701fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2702fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27039566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
2704fcdce8c4SStefano Zampini   /* query the size in bytes of the external buffer needed by cusparseSpGEMM_compute */
2705b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2706fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2707fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27089566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
2709fcdce8c4SStefano Zampini   /* Neither the CUSPARSE documentation nor the API is clear.
2710fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2711fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2712fcdce8c4SStefano Zampini      it only appears in the workEstimation calls, but it seems to be needed in compute, so the address
2713fcdce8c4SStefano Zampini      is probably stored in the descriptor! What a messy API... */
27149566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
2715fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2716b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2717fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2718fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27199566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2720fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
27219566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2722fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
27239566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
2724fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
27259566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2726fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
27279566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2728fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
27299566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2730b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2731fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
27329566063dSJacob Faibussowitsch                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2733ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2734fcdce8c4SStefano Zampini #else
27359566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2736b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2737fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2738fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2739fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
27409566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
2741fcdce8c4SStefano Zampini   c->nz = cnz;
2742fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
27439566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2744fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
27459566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2746fcdce8c4SStefano Zampini 
27479566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2748fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic product only.
2749fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows the symbolic phase to be done by passing NULL for values, but it seems quite buggy when
2750fcdce8c4SStefano Zampini      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2751b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2752fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2753fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2754fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
27559566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2756fcdce8c4SStefano Zampini #endif
27579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
27589566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2759fcdce8c4SStefano Zampini finalizesym:
2760fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2761fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2762fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
27639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m+1,&c->i));
27649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->j));
2765fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2766fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2767fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2768fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2769fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2770fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2771fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27729566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2774fcdce8c4SStefano Zampini   } else {
2775fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2776fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27779566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27789566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2779fcdce8c4SStefano Zampini   }
2780fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2781fcdce8c4SStefano Zampini     PetscInt r = 0;
2782fcdce8c4SStefano Zampini     c->i[0] = 0;
2783fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2784fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2785fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2786fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2787fcdce8c4SStefano Zampini     }
2788fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2789fcdce8c4SStefano Zampini   }
27909566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
27919566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->ilen));
27929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->imax));
2793fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2794fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2795fcdce8c4SStefano Zampini   c->rmax = 0;
2796fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2797fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2798fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2799fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2800fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2801fcdce8c4SStefano Zampini   }
28029566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
28039566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->a));
2804fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2805fcdce8c4SStefano Zampini 
2806fcdce8c4SStefano Zampini   C->nonzerostate++;
28079566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
28089566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
2809fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2810fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2811fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2812fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2813fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2814abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2815fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2816fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2817fcdce8c4SStefano Zampini   }
2818fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2819fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2820fcdce8c4SStefano Zampini }
2821fcdce8c4SStefano Zampini 
/* Forward declaration only: the routine takes a Mat and returns a PetscErrorCode; its definition is not in this part of the file.
   NOTE(review): judging by its name it presumably configures products of a SeqAIJ matrix with a SeqDense matrix -- confirm against the definition. */
2822fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2823fcdce8c4SStefano Zampini 
2824fcdce8c4SStefano Zampini /* handles sparse or dense B */
2825fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2826fcdce8c4SStefano Zampini {
2827fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2828fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2829fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2830fcdce8c4SStefano Zampini 
2831fcdce8c4SStefano Zampini   PetscFunctionBegin;
2832fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
28339566063dSJacob Faibussowitsch   PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
2834abb89eb1SStefano Zampini   if (!product->A->boundtocpu && !product->B->boundtocpu) {
28359566063dSJacob Faibussowitsch     PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
2836fcdce8c4SStefano Zampini   }
2837fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2838fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2839fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
28409566063dSJacob Faibussowitsch       PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
2841fcdce8c4SStefano Zampini     }
2842fcdce8c4SStefano Zampini   }
284365e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
284465e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
284565e4b4d4SStefano Zampini     switch (product->type) {
284665e4b4d4SStefano Zampini     case MATPRODUCT_AB:
284765e4b4d4SStefano Zampini       if (product->api_user) {
28489566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");PetscCall(ierr);
28499566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
28509566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
285165e4b4d4SStefano Zampini       } else {
28529566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");PetscCall(ierr);
28539566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
28549566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
285565e4b4d4SStefano Zampini       }
285665e4b4d4SStefano Zampini       break;
285765e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
285865e4b4d4SStefano Zampini       if (product->api_user) {
28599566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");PetscCall(ierr);
28609566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
28619566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
286265e4b4d4SStefano Zampini       } else {
28639566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");PetscCall(ierr);
28649566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
28659566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
286665e4b4d4SStefano Zampini       }
286765e4b4d4SStefano Zampini       break;
286865e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
286965e4b4d4SStefano Zampini       if (product->api_user) {
28709566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");PetscCall(ierr);
28719566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
28729566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
287365e4b4d4SStefano Zampini       } else {
28749566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");PetscCall(ierr);
28759566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
28769566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
287765e4b4d4SStefano Zampini       }
287865e4b4d4SStefano Zampini       break;
287965e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
288065e4b4d4SStefano Zampini       if (product->api_user) {
28819566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");PetscCall(ierr);
28829566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
28839566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
288465e4b4d4SStefano Zampini       } else {
28859566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");PetscCall(ierr);
28869566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
28879566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
288865e4b4d4SStefano Zampini       }
288965e4b4d4SStefano Zampini       break;
289065e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
289165e4b4d4SStefano Zampini       if (product->api_user) {
28929566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");PetscCall(ierr);
28939566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
28949566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
289565e4b4d4SStefano Zampini       } else {
28969566063dSJacob Faibussowitsch         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");PetscCall(ierr);
28979566063dSJacob Faibussowitsch         PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
28989566063dSJacob Faibussowitsch         ierr = PetscOptionsEnd();PetscCall(ierr);
289965e4b4d4SStefano Zampini       }
290065e4b4d4SStefano Zampini       break;
290165e4b4d4SStefano Zampini     default:
290265e4b4d4SStefano Zampini       break;
290365e4b4d4SStefano Zampini     }
290465e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
290565e4b4d4SStefano Zampini   }
290665e4b4d4SStefano Zampini   /* dispatch */
2907fcdce8c4SStefano Zampini   if (isdense) {
2908ccdfe979SStefano Zampini     switch (product->type) {
2909ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2910ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2911ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2912ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2913ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2914fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
29159566063dSJacob Faibussowitsch         PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2916fcdce8c4SStefano Zampini       } else {
2917fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2918fcdce8c4SStefano Zampini       }
2919fcdce8c4SStefano Zampini       break;
2920fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2921fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2922fcdce8c4SStefano Zampini       break;
2923ccdfe979SStefano Zampini     default:
2924ccdfe979SStefano Zampini       break;
2925ccdfe979SStefano Zampini     }
2926fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2927fcdce8c4SStefano Zampini     switch (product->type) {
2928fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2929fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2930fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2931fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2932fcdce8c4SStefano Zampini       break;
2933fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2934fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2935fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2936fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2937fcdce8c4SStefano Zampini       break;
2938fcdce8c4SStefano Zampini     default:
2939fcdce8c4SStefano Zampini       break;
2940fcdce8c4SStefano Zampini     }
2941fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
29429566063dSJacob Faibussowitsch     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
2943fcdce8c4SStefano Zampini   }
2944ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2945ccdfe979SStefano Zampini }
2946ccdfe979SStefano Zampini 
/* yy = A*xx: forwards to the shared kernel with no vector to add and no transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm));
  PetscFunctionReturn(0);
}
2953e6e9a74fSStefano Zampini 
/* zz = A*xx + yy: forwards to the shared kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm));
  PetscFunctionReturn(0);
}
2960e6e9a74fSStefano Zampini 
/* yy = A^H*xx: forwards to the shared kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm));
  PetscFunctionReturn(0);
}
2967e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy: forwards to the shared kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm));
  PetscFunctionReturn(0);
}
29749ae82921SPaul Mullowney 
/* yy = A^T*xx: forwards to the shared kernel with the transpose flag only */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm));
  PetscFunctionReturn(0);
}
2981ca45077fSPaul Mullowney 
/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Expects a 1D launch with at least n threads in total; surplus threads do nothing.
  No atomics are used, so the entries of idx[] are assumed to be distinct -- TODO confirm against callers.

  The global index is formed in PetscInt rather than int so that it cannot wrap when PETSc is
  configured with 64-bit indices and n exceeds the 32-bit range.
*/
__global__ static void ScatterAdd(PetscInt n,const PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
2987a0e72f99SJunchao Zhang 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - common implementation behind the MatMult/MatMultAdd family.

  z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

  Input Parameters:
+  A     - the matrix
.  xx    - the vector that is multiplied
.  yy    - the vector that is added, or NULL to compute zz = op(A) xx only
.  trans - apply the transpose of A
-  herm  - together with trans, apply the conjugate transpose; herm without trans is an error

  Output Parameter:
.  zz - the result; may be the same vector as yy

  Notes:
  When A has no nonzeros the product is skipped: zz is set to zero (yy == NULL) or to a copy of yy.

  For a plain transpose (trans && !herm) with A->form_explicit_transpose set, the explicitly stored transpose
  is used (formed on demand) and multiplied without transposition; otherwise the operation flag is handed to cuSPARSE.

  GPU flops are logged as 2*nz when yy is given and 2*nz - nonzerorowcnt otherwise.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) {
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
    /* matstruct is dereferenced right below, so guard the transpose paths like the non-transpose one */
    PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have the matrix structure required for the transpose product");
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
        /* a kernel launch reports nothing by itself; query the runtime so that a failed launch is not silently ignored */
        PetscCallCUDA(cudaGetLastError());
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) {
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
31769ae82921SPaul Mullowney 
/* zz = A^T*xx + yy: forwards to the shared kernel with the transpose flag only */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm));
  PetscFunctionReturn(0);
}
3183ca45077fSPaul Mullowney 
/*
  Finishes assembly through the SeqAIJ implementation, then frees the device matrix
  if the assembly changed the nonzero state of A.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState stateBefore = A->nonzerostate; /* recorded before the host assembly runs */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  /* nothing to discard when the nonzero state is unchanged or no device matrix exists */
  if (stateBefore == A->nonzerostate || !gpu->deviceMat) PetscFunctionReturn(0);

  PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
  PetscCallCUDA(cudaFree(gpu->deviceMat));
  gpu->deviceMat = NULL;
  PetscFunctionReturn(0);
}
31999ae82921SPaul Mullowney 
32009ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and will use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). Setting these parameters accurately can
   increase performance during matrix assembly by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of calling this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscInt *rownz = (PetscInt*)nnz; /* the preallocation routine is called with a non-const pointer */

  PetscFunctionBegin;
  /* create, size and type the matrix, then preallocate its storage */
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n)); /* local and global sizes are both m x n */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,rownz));
  PetscFunctionReturn(0);
}
32579ae82921SPaul Mullowney 
32586fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
32599ae82921SPaul Mullowney {
32609ae82921SPaul Mullowney   PetscFunctionBegin;
32619ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
32629566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
32639ae82921SPaul Mullowney   } else {
32649566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
3265aa372e3fSPaul Mullowney   }
32669566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
32679566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
32689566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL));
32699566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
32709566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
32719566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
32729566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL));
32739566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
32749566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
32759566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL));
32769566063dSJacob Faibussowitsch   PetscCall(MatDestroy_SeqAIJ(A));
32779ae82921SPaul Mullowney   PetscFunctionReturn(0);
32789ae82921SPaul Mullowney }
32799ae82921SPaul Mullowney 
3280ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
328195639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
32829ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
32839ff858a8SKarl Rupp {
32849ff858a8SKarl Rupp   PetscFunctionBegin;
32859566063dSJacob Faibussowitsch   PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
32869566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
32879ff858a8SKarl Rupp   PetscFunctionReturn(0);
32889ff858a8SKarl Rupp }
32899ff858a8SKarl Rupp 
3290039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
329195639643SRichard Tran Mills {
3292a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3293039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3294039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3295039c6fbaSStefano Zampini   PetscScalar        *ay;
3296039c6fbaSStefano Zampini   const PetscScalar  *ax;
3297039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
3298e6e9a74fSStefano Zampini 
329995639643SRichard Tran Mills   PetscFunctionBegin;
3300a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3301a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3302039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
33039566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
33049566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
3305a587d139SMark     PetscFunctionReturn(0);
330695639643SRichard Tran Mills   }
3307039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
33089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
33099566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
33105f80ce2aSJacob Faibussowitsch   PetscCheck(cy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
33115f80ce2aSJacob Faibussowitsch   PetscCheck(cx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3312039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
3313039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
3314039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3315039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3316039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3317039c6fbaSStefano Zampini     if (eq) {
3318039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3319039c6fbaSStefano Zampini     }
3320039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3321039c6fbaSStefano Zampini   }
3322d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3323d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3324039c6fbaSStefano Zampini 
3325039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3326039c6fbaSStefano Zampini     PetscScalar b = 1.0;
3327039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3328039c6fbaSStefano Zampini     size_t      bufferSize;
3329039c6fbaSStefano Zampini     void        *buffer;
3330039c6fbaSStefano Zampini #endif
3331039c6fbaSStefano Zampini 
33329566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
33339566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
33349566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST));
3335039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
33369566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3337039c6fbaSStefano Zampini                                                   &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3338039c6fbaSStefano Zampini                                                   &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
33395f80ce2aSJacob Faibussowitsch                                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize));
33409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
33419566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
33429566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3343039c6fbaSStefano Zampini                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3344039c6fbaSStefano Zampini                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
33455f80ce2aSJacob Faibussowitsch                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer));
33469566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
33479566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
33489566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(buffer));
3349039c6fbaSStefano Zampini #else
33509566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
33519566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3352039c6fbaSStefano Zampini                                        &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3353039c6fbaSStefano Zampini                                        &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
33545f80ce2aSJacob Faibussowitsch                                        cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get()));
33559566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(x->nz + y->nz));
33569566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
3357039c6fbaSStefano Zampini #endif
33589566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE));
33599566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
33609566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
33619566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3362039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3363a587d139SMark     cublasHandle_t cublasv2handle;
3364a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3365039c6fbaSStefano Zampini 
33669566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&ax));
33679566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
33689566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
33699566063dSJacob Faibussowitsch     PetscCall(PetscBLASIntCast(x->nz,&bnz));
33709566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
33719566063dSJacob Faibussowitsch     PetscCallCUBLAS(cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one));
33729566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuFlops(2.0*bnz));
33739566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
33749566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&ax));
33759566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
33769566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJInvalidateDiagonal(Y));
3377039c6fbaSStefano Zampini   } else {
33789566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
33799566063dSJacob Faibussowitsch     PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
3380a587d139SMark   }
338195639643SRichard Tran Mills   PetscFunctionReturn(0);
338295639643SRichard Tran Mills }
338395639643SRichard Tran Mills 
338433c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
338533c9ba73SStefano Zampini {
338633c9ba73SStefano Zampini   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
338733c9ba73SStefano Zampini   PetscScalar    *ay;
338833c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
338933c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
339033c9ba73SStefano Zampini 
339133c9ba73SStefano Zampini   PetscFunctionBegin;
33929566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&ay));
33939566063dSJacob Faibussowitsch   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
33949566063dSJacob Faibussowitsch   PetscCall(PetscBLASIntCast(y->nz,&bnz));
33959566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
33969566063dSJacob Faibussowitsch   PetscCallCUBLAS(cublasXscal(cublasv2handle,bnz,&a,ay,one));
33979566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(bnz));
33989566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
33999566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&ay));
34009566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(Y));
340133c9ba73SStefano Zampini   PetscFunctionReturn(0);
340233c9ba73SStefano Zampini }
340333c9ba73SStefano Zampini 
34043fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
34053fa6b06aSMark Adams {
34067e8381f9SStefano Zampini   PetscBool      both = PETSC_FALSE;
3407a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
34087e8381f9SStefano Zampini 
34093fa6b06aSMark Adams   PetscFunctionBegin;
34103fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
34113fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
34127e8381f9SStefano Zampini     if (spptr->mat) {
34137e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
34147e8381f9SStefano Zampini       if (matrix->values) {
34157e8381f9SStefano Zampini         both = PETSC_TRUE;
34167e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
34177e8381f9SStefano Zampini       }
34187e8381f9SStefano Zampini     }
34197e8381f9SStefano Zampini     if (spptr->matTranspose) {
34207e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
34217e8381f9SStefano Zampini       if (matrix->values) {
34227e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
34237e8381f9SStefano Zampini       }
34247e8381f9SStefano Zampini     }
34253fa6b06aSMark Adams   }
34269566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(a->a,a->i[A->rmap->n]));
34279566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
34287e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3429a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
34303fa6b06aSMark Adams   PetscFunctionReturn(0);
34313fa6b06aSMark Adams }
34323fa6b06aSMark Adams 
3433a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3434a587d139SMark {
3435a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3436a587d139SMark 
3437a587d139SMark   PetscFunctionBegin;
34389a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
34399a14fc28SStefano Zampini     A->boundtocpu = flg;
34409a14fc28SStefano Zampini     PetscFunctionReturn(0);
34419a14fc28SStefano Zampini   }
3442a587d139SMark   if (flg) {
34439566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
3444a587d139SMark 
344533c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3446a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3447a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3448a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3449a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3450a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3451a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3452a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3453a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3454fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
34559566063dSJacob Faibussowitsch     PetscCall(PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps)));
34569566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
34579566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
34589566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
34599566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
34609566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
34619566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
34629566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
3463a587d139SMark   } else {
346433c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3465a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3466a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3467a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3468a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3469a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3470a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3471a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3472a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3473fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
347467a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
347567a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
347667a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
347767a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
347867a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
347967a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
34807ee59b9bSJunchao Zhang     a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;
34817ee59b9bSJunchao Zhang 
34829566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
34839566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
34849566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
34859566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
34869566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
34879566063dSJacob Faibussowitsch     PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
3488a587d139SMark    }
3489a587d139SMark   A->boundtocpu = flg;
3490ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
3491ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
3492ea500dcfSRichard Tran Mills   } else {
3493ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
3494ea500dcfSRichard Tran Mills   }
3495a587d139SMark   PetscFunctionReturn(0);
3496a587d139SMark }
3497a587d139SMark 
349849735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
34999ae82921SPaul Mullowney {
350049735bf3SStefano Zampini   Mat              B;
35019ae82921SPaul Mullowney 
35029ae82921SPaul Mullowney   PetscFunctionBegin;
35039566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
350449735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
35059566063dSJacob Faibussowitsch     PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
350649735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
35079566063dSJacob Faibussowitsch     PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
350849735bf3SStefano Zampini   }
350949735bf3SStefano Zampini   B = *newmat;
351049735bf3SStefano Zampini 
35119566063dSJacob Faibussowitsch   PetscCall(PetscFree(B->defaultvectype));
35129566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA,&B->defaultvectype));
351334136279SStefano Zampini 
351449735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
35159ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3516e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
35179566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
35189566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
35199566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
35201a2c6b5cSJunchao Zhang       spptr->format     = MAT_CUSPARSE_CSR;
3521d8132acaSStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
35228efa179dSJose E. Roman      #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
3523a435da06SStefano Zampini       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3524a435da06SStefano Zampini      #else
3525d8132acaSStefano Zampini       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3526a435da06SStefano Zampini      #endif
3527d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3528d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3529d8132acaSStefano Zampini      #endif
35301a2c6b5cSJunchao Zhang       B->spptr = spptr;
35319ae82921SPaul Mullowney     } else {
3532e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3533e6e9a74fSStefano Zampini 
35349566063dSJacob Faibussowitsch       PetscCall(PetscNew(&spptr));
35359566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreate(&spptr->handle));
35369566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseSetStream(spptr->handle,PetscDefaultCudaStream));
3537e6e9a74fSStefano Zampini       B->spptr = spptr;
35389ae82921SPaul Mullowney     }
3539e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
354049735bf3SStefano Zampini   }
3541693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
35429ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
35431a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
35449ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
354595639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3546693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
35472205254eSKarl Rupp 
35489566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
35499566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
35509566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
3551ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
35529566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
3553ae48a8d0SStefano Zampini #endif
35549566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
35559ae82921SPaul Mullowney   PetscFunctionReturn(0);
35569ae82921SPaul Mullowney }
35579ae82921SPaul Mullowney 
355802fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
355902fe1965SBarry Smith {
356002fe1965SBarry Smith   PetscFunctionBegin;
35619566063dSJacob Faibussowitsch   PetscCall(MatCreate_SeqAIJ(B));
35629566063dSJacob Faibussowitsch   PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
356302fe1965SBarry Smith   PetscFunctionReturn(0);
356402fe1965SBarry Smith }
356502fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
35837f756511SDominic Meiser 
3584bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
35850f39cd5aSBarry Smith 
35863ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
358742c9c57cSBarry Smith {
358842c9c57cSBarry Smith   PetscFunctionBegin;
35899566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
35909566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse));
35919566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse));
35929566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse));
35939566063dSJacob Faibussowitsch   PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse));
3594bddcd29dSMark Adams 
359542c9c57cSBarry Smith   PetscFunctionReturn(0);
359642c9c57cSBarry Smith }
359729b38603SBarry Smith 
3598cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
3599cbc6b225SStefano Zampini {
3600cbc6b225SStefano Zampini   Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;
3601cbc6b225SStefano Zampini 
3602cbc6b225SStefano Zampini   PetscFunctionBegin;
3603cbc6b225SStefano Zampini   if (!cusp) PetscFunctionReturn(0);
3604cbc6b225SStefano Zampini   delete cusp->cooPerm;
3605cbc6b225SStefano Zampini   delete cusp->cooPerm_a;
3606cbc6b225SStefano Zampini   cusp->cooPerm = NULL;
3607cbc6b225SStefano Zampini   cusp->cooPerm_a = NULL;
3608cbc6b225SStefano Zampini   if (cusp->use_extended_coo) {
36099566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->jmap_d));
36109566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusp->perm_d));
3611cbc6b225SStefano Zampini   }
3612cbc6b225SStefano Zampini   cusp->use_extended_coo = PETSC_FALSE;
3613cbc6b225SStefano Zampini   PetscFunctionReturn(0);
3614cbc6b225SStefano Zampini }
3615cbc6b225SStefano Zampini 
3616470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
36177f756511SDominic Meiser {
36187f756511SDominic Meiser   PetscFunctionBegin;
36197f756511SDominic Meiser   if (*cusparsestruct) {
36209566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format));
36219566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format));
36227f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
362381902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
36247e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
36257e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
3626a49f1ed0SStefano Zampini     delete (*cusparsestruct)->csr2csc_i;
36279566063dSJacob Faibussowitsch     if ((*cusparsestruct)->handle) PetscCallCUSPARSE(cusparseDestroy((*cusparsestruct)->handle));
36289566063dSJacob Faibussowitsch     if ((*cusparsestruct)->jmap_d) PetscCallCUDA(cudaFree((*cusparsestruct)->jmap_d));
36299566063dSJacob Faibussowitsch     if ((*cusparsestruct)->perm_d) PetscCallCUDA(cudaFree((*cusparsestruct)->perm_d));
36309566063dSJacob Faibussowitsch     PetscCall(PetscFree(*cusparsestruct));
36317f756511SDominic Meiser   }
36327f756511SDominic Meiser   PetscFunctionReturn(0);
36337f756511SDominic Meiser }
36347f756511SDominic Meiser 
36357f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
36367f756511SDominic Meiser {
36377f756511SDominic Meiser   PetscFunctionBegin;
36387f756511SDominic Meiser   if (*mat) {
36397f756511SDominic Meiser     delete (*mat)->values;
36407f756511SDominic Meiser     delete (*mat)->column_indices;
36417f756511SDominic Meiser     delete (*mat)->row_offsets;
36427f756511SDominic Meiser     delete *mat;
36437f756511SDominic Meiser     *mat = 0;
36447f756511SDominic Meiser   }
36457f756511SDominic Meiser   PetscFunctionReturn(0);
36467f756511SDominic Meiser }
36477f756511SDominic Meiser 
3648470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
36497f756511SDominic Meiser {
36507f756511SDominic Meiser   PetscFunctionBegin;
36517f756511SDominic Meiser   if (*trifactor) {
36529566063dSJacob Faibussowitsch     if ((*trifactor)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*trifactor)->descr));
36539566063dSJacob Faibussowitsch     if ((*trifactor)->solveInfo) PetscCallCUSPARSE(cusparse_destroy_analysis_info((*trifactor)->solveInfo));
36549566063dSJacob Faibussowitsch     PetscCall(CsrMatrix_Destroy(&(*trifactor)->csrMat));
36559566063dSJacob Faibussowitsch     if ((*trifactor)->solveBuffer)   PetscCallCUDA(cudaFree((*trifactor)->solveBuffer));
36569566063dSJacob Faibussowitsch     if ((*trifactor)->AA_h)   PetscCallCUDA(cudaFreeHost((*trifactor)->AA_h));
3657afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
36589566063dSJacob Faibussowitsch     if ((*trifactor)->csr2cscBuffer) PetscCallCUDA(cudaFree((*trifactor)->csr2cscBuffer));
3659afb2bd1cSJunchao Zhang    #endif
36609566063dSJacob Faibussowitsch     PetscCall(PetscFree(*trifactor));
36617f756511SDominic Meiser   }
36627f756511SDominic Meiser   PetscFunctionReturn(0);
36637f756511SDominic Meiser }
36647f756511SDominic Meiser 
/*
  Destroys a mat-vec struct and sets *matstruct to NULL.

  Input Parameters:
+ matstruct - address of the struct pointer; a NULL *matstruct is a no-op
- format    - storage format of (*matstruct)->mat; selects how the matrix payload is released

  For MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB the payload is a cusparseHybMat_t; with
  CUDA >= 11.0 those formats raise PETSC_ERR_SUP. Otherwise the payload is a CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures, as the tri-factor overload of this function does */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* three cuSpMV slots; only those flagged initialized own a buffer and vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37067f756511SDominic Meiser 
/*
  Releases everything held by a tri-factors container without destroying the
  container itself: the four triangular-factor structs, the row/column
  permutation arrays, the work vector and the band-solver device arrays.
  Every released member is reset to NULL so that the container can be reset
  again (or destroyed) without touching freed memory. A NULL *trifactors is a no-op.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    /* clear the band arrays after freeing; a stale pointer here would be freed again on the next reset */
    if ((*trifactors)->a_band_d) {
      PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
      (*trifactors)->a_band_d = NULL;
    }
    if ((*trifactors)->i_band_d) {
      PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
      (*trifactors)->i_band_d = NULL;
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3727ccdfe979SStefano Zampini 
/*
  Fully destroys a tri-factors container: resets its contents via
  MatSeqAIJCUSPARSETriFactors_Reset(), destroys the cuSPARSE handle it stores
  (if any) and releases the container with PetscFree(). A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t stored;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  stored = (*trifactors)->handle;
  if (stored) PetscCallCUSPARSE(cusparseDestroy(stored));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(0);
}
37427e8381f9SStefano Zampini 
/* Strict ordering of (row, col) pairs: by row first, then by column within a row. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    /* use the free function thrust::get<N>() rather than the member form; the free function is the portable accessor */
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
37537e8381f9SStefano Zampini 
/* Equality of (row, col) pairs: true only when both components match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    /* use the free function thrust::get<N>() rather than the member form; the free function is the portable accessor */
    if (thrust::get<0>(t1) != thrust::get<0>(t2) || thrust::get<1>(t1) != thrust::get<1>(t2)) return false;
    return true;
  }
};
37637e8381f9SStefano Zampini 
/* Change indicator for two indices: 1 when they differ, 0 when they are equal. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
37727e8381f9SStefano Zampini 
/* Logical OR of two indicator values, returned as 0 or 1 (despite the name, not an arithmetic sum). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
37817e8381f9SStefano Zampini 
37827e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  Companion of MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(): moves the COO values v[]
  into the device CSR value array using the permutation (cooPerm) and, when the COO
  pattern had repeated (i,j) pairs, the key array (cooPerm_a) built at preallocation time.

  Input Parameters:
+ A     - the matrix
. v     - COO values, host or device memory; NULL means "no values": with INSERT_VALUES
          the device values are zeroed, with any other mode they are left untouched
- imode - ADD_VALUES accumulates into the existing values, anything else overwrites

  If no permutation is stored on the matrix, the routine only runs a final assembly.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *gpu    = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *host   = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *staged = NULL; /* device copy of v[], only when v[] lives on the host */
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscCheck(gpu,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(gpu->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!gpu->cooPerm) {
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)gpu->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  } else {
    const PetscInt                        ncoo       = gpu->cooPerm->size();
    const bool                            accumulate = (imode == ADD_VALUES);
    thrust::device_ptr<const PetscScalar> src;

    if (isCudaMem(v)) {
      src = thrust::device_pointer_cast(v);
    } else {
      staged = new THRUSTARRAY(ncoo);
      staged->assign(v,v+ncoo);
      src = staged->data();
      PetscCall(PetscLogCpuToGpu(ncoo*sizeof(PetscScalar)));
    }

    PetscCall(PetscLogGpuTimeBegin());
    if (gpu->cooPerm_a) {
      /* Repeated entries: cooPerm_a is an ordered key array of length ncoo (e.g. [0,0,1,2,3,4]);
         permuted value k belongs to the cooPerm_a[k]-th unique nonzero, so values sharing a key
         are summed with reduce_by_key(keys_first,keys_last,values_first,keys_out,values_out). */
      auto permuted = thrust::make_permutation_iterator(src,gpu->cooPerm->begin());
      if (accumulate) {
        /* reduce into a scratch array first, then add it onto the existing values */
        THRUSTARRAY reduced(csr->values->size());
        thrust::reduce_by_key(gpu->cooPerm_a->begin(),gpu->cooPerm_a->end(),permuted,thrust::make_discard_iterator(),reduced.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(reduced.begin(),reduced.end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
      } else {
        /* overwrite: reduce straight into the matrix values */
        thrust::reduce_by_key(gpu->cooPerm_a->begin(),gpu->cooPerm_a->end(),permuted,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else {
      /* Every COO entry is unique: pair src[cooPerm[k]] with values[k] and apply the functor elementwise */
      auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(src,gpu->cooPerm->begin()),
                                                                     csr->values->begin()));
      auto pair_last  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(src,gpu->cooPerm->end()),
                                                                     csr->values->end()));
      if (accumulate) thrust::for_each(pair_first,pair_last,VecCUDAPlusEquals()); /* values[k] += src[cooPerm[k]] */
      else            thrust::for_each(pair_first,pair_last,VecCUDAEquals());     /* presumably values[k] = src[cooPerm[k]]; functor is defined elsewhere */
    }
    PetscCall(PetscLogGpuTimeEnd());
  }

  delete staged;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* condensed version of the bookkeeping done by MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,host->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",host->rmax));
  host->reallocs      = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->was_assembled    = PETSC_TRUE;
  A->assembled        = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
38647e8381f9SStefano Zampini 
/*
  Marks the cached transpose of A as out of date (A->transupdated = PETSC_FALSE).
  When destroy is true the cached transpose struct and the csr2csc index array are
  released as well. Does nothing if A carries no CUSPARSE data.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (gpu) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&gpu->matTranspose,gpu->format));
      delete gpu->csr2csc_i;
      gpu->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3880a49f1ed0SStefano Zampini 
38817e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices.

  Builds the CSR structure of A from n COO (row, column) pairs, sorting and de-duplicating
  them on the GPU, and stores on the matrix the permutation (cooPerm) and, if there are
  repeated pairs, the key array (cooPerm_a) later used by MatSetValuesCOO_SeqAIJCUSPARSE_Basic().

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices, host or device memory
- coo_j - column indices, host or device memory
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));
  /* the permutation arrays can only be reused when the number of COO entries is unchanged */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);
    /* the caller may hand over device arrays; raw pointers passed to assign() are treated as host memory */
    const bool     i_on_gpu = isCudaMem(coo_i);
    const bool     j_on_gpu = isCudaMem(coo_j);
    const int      nhost    = (i_on_gpu ? 0 : 1) + (j_on_gpu ? 0 : 1);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    /* only arrays that actually cross from host to device are logged */
    if (nhost) PetscCall(PetscLogCpuToGpu(1.*nhost*n*sizeof(PetscInt)));
    if (i_on_gpu) d_i.assign(thrust::device_pointer_cast(coo_i),thrust::device_pointer_cast(coo_i)+n);
    else          d_i.assign(coo_i,coo_i+n);
    if (j_on_gpu) d_j.assign(thrust::device_pointer_cast(coo_j),thrust::device_pointer_cast(coo_j)+n);
    else          d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
      d_j     = [2,2,3,5,6,x]
      nekey points at the trailing x (one past the last unique pair); ekey is still the original end
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), thrust::get<0>(nekey.get_iterator_tuple()), /* binary search each row index in [0,6) in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    PetscCall(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays from the device results */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));
    /* per-row lengths, number of nonempty rows and the longest row */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  } else {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3994ed502f03SStefano Zampini 
/*
  COO preallocation entry point. After clearing any previous COO state it picks one of
  two strategies:
   - the 'Basic' GPU path, when coo_i is NULL, is not host memory, or the host arrays
     contain no negative index;
   - the host path (MatSetPreallocationCOO_SeqAIJ) otherwise, after which the jmap and
     perm arrays it produced are mirrored on the device and use_extended_coo is set.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscBool    has_negative = PETSC_FALSE;
  PetscMemType where        = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));

  /* the indices can only be inspected from the CPU when they live in host memory */
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&where));
    if (PetscMemTypeHost(where)) {
      PetscCount k = 0;
      while (k < coo_n && coo_i[k] >= 0 && coo_j[k] >= 0) k++; /* stop at the first negative index */
      if (k < coo_n) has_negative = PETSC_TRUE;
    }
  }

  if (!has_negative) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = (Mat_SeqAIJ*)mat->data;
    dev = (Mat_SeqAIJCUSPARSE*)mat->spptr;

    /* jmap has nz+1 entries, perm has Atot entries */
    const size_t jmap_bytes = (seq->nz+1)*sizeof(PetscCount);
    const size_t perm_bytes = seq->Atot*sizeof(PetscCount);

    PetscCallCUDA(cudaMalloc((void**)&dev->jmap_d,jmap_bytes));
    PetscCallCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,jmap_bytes,cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMalloc((void**)&dev->perm_d,perm_bytes));
    PetscCallCUDA(cudaMemcpy(dev->perm_d,seq->perm,perm_bytes,cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4030219fbbafSJunchao Zhang 
/*
  Kernel: for every matrix nonzero i in [0,nnz), sums the COO values kv[perm[k]] for
  k in [jmap[i], jmap[i+1]) and either stores the sum in a[i] (INSERT_VALUES) or adds
  it to a[i] (any other mode).

  Expects a 1D launch; each thread strides over i by the total number of launched
  threads, so any grid size covers all nnz entries. jmap must hold nnz+1 entries.
*/
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* widen to PetscCount before multiplying: the built-in index variables are 32-bit unsigned and their product can wrap */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x*blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4041219fbbafSJunchao Zhang 
/*
  Sets the matrix values from COO values v[].

  When the matrix was preallocated through the extended COO path (use_extended_coo),
  the values are combined on the device by the MatAddCOOValues kernel; a host v[] is
  first copied into a temporary device buffer. Otherwise the call is forwarded to
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscCount    nnz = seq->nz;
  const bool          overwrite = (imode == INSERT_VALUES);
  const PetscScalar   *vdev = v; /* device-visible view of v[] */
  PetscScalar         *vals;
  PetscMemType        vtype;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
    PetscFunctionReturn(0);
  }

  PetscCall(PetscGetMemType(v,&vtype));
  const bool staged = PetscMemTypeHost(vtype);
  if (staged) { /* v[] was given on the host: stage it in a temporary device buffer */
    const size_t nbytes = seq->coo_n*sizeof(PetscScalar);
    PetscCallCUDA(cudaMalloc((void**)&vdev,nbytes));
    PetscCallCUDA(cudaMemcpy((void*)vdev,v,nbytes,cudaMemcpyHostToDevice));
  }

  /* write-only access when overwriting, read-write access when accumulating */
  if (overwrite) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&vals));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A,&vals));
  }

  if (nnz) {
    MatAddCOOValues<<<(nnz+255)/256,256>>>(vdev,nnz,dev->jmap_d,dev->perm_d,imode,vals);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (overwrite) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&vals));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&vals));
  }

  if (staged) PetscCallCUDA(cudaFree((void*)vdev));
  PetscFunctionReturn(0);
}
4076219fbbafSJunchao Zhang 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (pass NULL if not needed)
-   j - the CSR column indices (pass NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      If both i and j are NULL the routine returns immediately without touching the matrix.

      ELL and HYB storage formats are not supported.

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing requested; each output is optional on its own, see the guards below */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the full offsets are built from the host a->i once and cached on the matrix */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
41245f101d05SStefano Zampini 
41255b7e41feSStefano Zampini /*@C
41265b7e41feSStefano Zampini     MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()
41275b7e41feSStefano Zampini 
41285b7e41feSStefano Zampini    Not collective
41295b7e41feSStefano Zampini 
41305b7e41feSStefano Zampini     Input Parameters:
41315b7e41feSStefano Zampini +   A - the matrix
41325b7e41feSStefano Zampini -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
41335b7e41feSStefano Zampini 
41345b7e41feSStefano Zampini     Output Parameters:
41355b7e41feSStefano Zampini +   ia - the CSR row pointers
41365b7e41feSStefano Zampini -   ja - the CSR column indices
41375b7e41feSStefano Zampini 
41385b7e41feSStefano Zampini     Level: developer
41395b7e41feSStefano Zampini 
41405b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetIJ()
41415b7e41feSStefano Zampini @*/
41425f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
41435f101d05SStefano Zampini {
41445f101d05SStefano Zampini   PetscFunctionBegin;
41455f101d05SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
41465f101d05SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
41475f101d05SStefano Zampini   if (i) *i = NULL;
41485f101d05SStefano Zampini   if (j) *j = NULL;
41495f101d05SStefano Zampini   PetscFunctionReturn(0);
41505f101d05SStefano Zampini }
41515f101d05SStefano Zampini 
41525b7e41feSStefano Zampini /*@C
41535b7e41feSStefano Zampini    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
41545b7e41feSStefano Zampini 
41555b7e41feSStefano Zampini    Not Collective
41565b7e41feSStefano Zampini 
41575b7e41feSStefano Zampini    Input Parameter:
41585b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
41595b7e41feSStefano Zampini 
41605b7e41feSStefano Zampini    Output Parameter:
41615b7e41feSStefano Zampini .   a - pointer to the device data
41625b7e41feSStefano Zampini 
41635b7e41feSStefano Zampini    Level: developer
41645b7e41feSStefano Zampini 
41655b7e41feSStefano Zampini    Notes: may trigger host-device copies if up-to-date matrix data is on host
41665b7e41feSStefano Zampini 
41675b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
41685b7e41feSStefano Zampini @*/
4169ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4170ed502f03SStefano Zampini {
4171ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4172ed502f03SStefano Zampini   CsrMatrix          *csr;
4173ed502f03SStefano Zampini 
4174ed502f03SStefano Zampini   PetscFunctionBegin;
4175ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4176ed502f03SStefano Zampini   PetscValidPointer(a,2);
4177ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
41782c71b3e2SJacob Faibussowitsch   PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
41799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
418028b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4181ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
418228b400f6SJacob Faibussowitsch   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4183ed502f03SStefano Zampini   *a = csr->values->data().get();
4184ed502f03SStefano Zampini   PetscFunctionReturn(0);
4185ed502f03SStefano Zampini }
4186ed502f03SStefano Zampini 
41875b7e41feSStefano Zampini /*@C
41885b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
41895b7e41feSStefano Zampini 
41905b7e41feSStefano Zampini    Not Collective
41915b7e41feSStefano Zampini 
41925b7e41feSStefano Zampini    Input Parameter:
41935b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
41945b7e41feSStefano Zampini 
41955b7e41feSStefano Zampini    Output Parameter:
41965b7e41feSStefano Zampini .   a - pointer to the device data
41975b7e41feSStefano Zampini 
41985b7e41feSStefano Zampini    Level: developer
41995b7e41feSStefano Zampini 
42005b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead()
42015b7e41feSStefano Zampini @*/
4202ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4203ed502f03SStefano Zampini {
4204ed502f03SStefano Zampini   PetscFunctionBegin;
4205ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4206ed502f03SStefano Zampini   PetscValidPointer(a,2);
4207ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4208ed502f03SStefano Zampini   *a = NULL;
4209ed502f03SStefano Zampini   PetscFunctionReturn(0);
4210ed502f03SStefano Zampini }
4211ed502f03SStefano Zampini 
42125b7e41feSStefano Zampini /*@C
42135b7e41feSStefano Zampini    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
42145b7e41feSStefano Zampini 
42155b7e41feSStefano Zampini    Not Collective
42165b7e41feSStefano Zampini 
42175b7e41feSStefano Zampini    Input Parameter:
42185b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
42195b7e41feSStefano Zampini 
42205b7e41feSStefano Zampini    Output Parameter:
42215b7e41feSStefano Zampini .   a - pointer to the device data
42225b7e41feSStefano Zampini 
42235b7e41feSStefano Zampini    Level: developer
42245b7e41feSStefano Zampini 
42255b7e41feSStefano Zampini    Notes: may trigger host-device copies if up-to-date matrix data is on host
42265b7e41feSStefano Zampini 
42275b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
42285b7e41feSStefano Zampini @*/
4229039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4230039c6fbaSStefano Zampini {
4231039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4232039c6fbaSStefano Zampini   CsrMatrix          *csr;
4233039c6fbaSStefano Zampini 
4234039c6fbaSStefano Zampini   PetscFunctionBegin;
4235039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4236039c6fbaSStefano Zampini   PetscValidPointer(a,2);
4237039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
42382c71b3e2SJacob Faibussowitsch   PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
42399566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
424028b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4241039c6fbaSStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
424228b400f6SJacob Faibussowitsch   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4243039c6fbaSStefano Zampini   *a = csr->values->data().get();
4244039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
42459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
4246039c6fbaSStefano Zampini   PetscFunctionReturn(0);
4247039c6fbaSStefano Zampini }
42485b7e41feSStefano Zampini /*@C
42495b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4250039c6fbaSStefano Zampini 
42515b7e41feSStefano Zampini    Not Collective
42525b7e41feSStefano Zampini 
42535b7e41feSStefano Zampini    Input Parameter:
42545b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
42555b7e41feSStefano Zampini 
42565b7e41feSStefano Zampini    Output Parameter:
42575b7e41feSStefano Zampini .   a - pointer to the device data
42585b7e41feSStefano Zampini 
42595b7e41feSStefano Zampini    Level: developer
42605b7e41feSStefano Zampini 
42615b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray()
42625b7e41feSStefano Zampini @*/
4263039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4264039c6fbaSStefano Zampini {
4265039c6fbaSStefano Zampini   PetscFunctionBegin;
4266039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4267039c6fbaSStefano Zampini   PetscValidPointer(a,2);
4268039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
42699566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
42709566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4271039c6fbaSStefano Zampini   *a = NULL;
4272039c6fbaSStefano Zampini   PetscFunctionReturn(0);
4273039c6fbaSStefano Zampini }
4274039c6fbaSStefano Zampini 
42755b7e41feSStefano Zampini /*@C
42765b7e41feSStefano Zampini    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
42775b7e41feSStefano Zampini 
42785b7e41feSStefano Zampini    Not Collective
42795b7e41feSStefano Zampini 
42805b7e41feSStefano Zampini    Input Parameter:
42815b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
42825b7e41feSStefano Zampini 
42835b7e41feSStefano Zampini    Output Parameter:
42845b7e41feSStefano Zampini .   a - pointer to the device data
42855b7e41feSStefano Zampini 
42865b7e41feSStefano Zampini    Level: developer
42875b7e41feSStefano Zampini 
42885b7e41feSStefano Zampini    Notes: does not trigger host-device copies and flags data validity on the GPU
42895b7e41feSStefano Zampini 
42905b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
42915b7e41feSStefano Zampini @*/
4292ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4293ed502f03SStefano Zampini {
4294ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4295ed502f03SStefano Zampini   CsrMatrix          *csr;
4296ed502f03SStefano Zampini 
4297ed502f03SStefano Zampini   PetscFunctionBegin;
4298ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4299ed502f03SStefano Zampini   PetscValidPointer(a,2);
4300ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
43012c71b3e2SJacob Faibussowitsch   PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
430228b400f6SJacob Faibussowitsch   PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4303ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
430428b400f6SJacob Faibussowitsch   PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4305ed502f03SStefano Zampini   *a = csr->values->data().get();
4306039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
43079566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
4308ed502f03SStefano Zampini   PetscFunctionReturn(0);
4309ed502f03SStefano Zampini }
4310ed502f03SStefano Zampini 
43115b7e41feSStefano Zampini /*@C
43125b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
43135b7e41feSStefano Zampini 
43145b7e41feSStefano Zampini    Not Collective
43155b7e41feSStefano Zampini 
43165b7e41feSStefano Zampini    Input Parameter:
43175b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
43185b7e41feSStefano Zampini 
43195b7e41feSStefano Zampini    Output Parameter:
43205b7e41feSStefano Zampini .   a - pointer to the device data
43215b7e41feSStefano Zampini 
43225b7e41feSStefano Zampini    Level: developer
43235b7e41feSStefano Zampini 
43245b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
43255b7e41feSStefano Zampini @*/
4326ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4327ed502f03SStefano Zampini {
4328ed502f03SStefano Zampini   PetscFunctionBegin;
4329ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4330ed502f03SStefano Zampini   PetscValidPointer(a,2);
4331ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
43329566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJInvalidateDiagonal(A));
43339566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)A));
4334ed502f03SStefano Zampini   *a = NULL;
4335ed502f03SStefano Zampini   PetscFunctionReturn(0);
4336ed502f03SStefano Zampini }
4337ed502f03SStefano Zampini 
4338ed502f03SStefano Zampini struct IJCompare4
4339ed502f03SStefano Zampini {
4340ed502f03SStefano Zampini   __host__ __device__
43412ed87e7eSStefano Zampini   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
4342ed502f03SStefano Zampini   {
4343ed502f03SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
4344ed502f03SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
4345ed502f03SStefano Zampini     return false;
4346ed502f03SStefano Zampini   }
4347ed502f03SStefano Zampini };
4348ed502f03SStefano Zampini 
43498909a122SStefano Zampini struct Shift
43508909a122SStefano Zampini {
4351ed502f03SStefano Zampini   int _shift;
4352ed502f03SStefano Zampini 
4353ed502f03SStefano Zampini   Shift(int shift) : _shift(shift) {}
4354ed502f03SStefano Zampini   __host__ __device__
4355ed502f03SStefano Zampini   inline int operator() (const int &c)
4356ed502f03SStefano Zampini   {
4357ed502f03SStefano Zampini     return c + _shift;
4358ed502f03SStefano Zampini   }
4359ed502f03SStefano Zampini };
4360ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices A, B with the same number of rows by concatenating their rows side by side: C = [A, B], i.e. the [A';B']' operation in MATLAB notation */
4362ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4363ed502f03SStefano Zampini {
4364ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4365ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4366ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4367ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4368ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4369ed502f03SStefano Zampini   cusparseStatus_t             stat;
4370ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4371ed502f03SStefano Zampini 
4372ed502f03SStefano Zampini   PetscFunctionBegin;
4373ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4374ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4375ed502f03SStefano Zampini   PetscValidPointer(C,4);
4376ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4377ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
43785f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
4379*08401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
43802c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
43812c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4382ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4383ed502f03SStefano Zampini     m     = A->rmap->n;
4384ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
43859566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,C));
43869566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C,m,n,m,n));
43879566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
4388ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4389ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4390ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4391ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4392ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4393ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4394ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4395ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4396ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4397ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4398ed502f03SStefano Zampini     Ccusp->nrows    = m;
4399ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4400ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4401ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4402ed502f03SStefano Zampini     Ccsr->num_cols  = n;
44039566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
44049566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
44059566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
44069566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
44079566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
44089566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
44099566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44109566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
44119566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44129566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
44139566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
441428b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
441528b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4416ed502f03SStefano Zampini 
4417ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4418ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4419ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4420ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4421ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4422ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4423ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4424ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4425ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4426ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4427ed502f03SStefano Zampini     if (c->nz) {
44282ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44292ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44302ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44312ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44322ed87e7eSStefano Zampini 
4433ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4434ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4435ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4436ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
44379566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4438ed502f03SStefano Zampini         }
44392ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44402ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4441ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4442ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4443ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4444ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
44459566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
4446ed502f03SStefano Zampini         }
44472ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44482ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
44499566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
44502ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44512ed87e7eSStefano Zampini                               Aroff->data().get(),
44522ed87e7eSStefano Zampini                               Annz,
44532ed87e7eSStefano Zampini                               m,
44542ed87e7eSStefano Zampini                               Acoo->data().get(),
44559566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
4456ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44572ed87e7eSStefano Zampini                               Broff->data().get(),
4458ed502f03SStefano Zampini                               Bnnz,
4459ed502f03SStefano Zampini                               m,
44602ed87e7eSStefano Zampini                               Bcoo->data().get(),
44619566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
44622ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44632ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44642ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44658909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4466ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4467ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44688909a122SStefano Zampini #else
44698909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44708909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44718909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44728909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44738909a122SStefano Zampini #endif
44742ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44752ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44762ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44772ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
44782ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
44792ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4480ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4481ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4482ed502f03SStefano Zampini       thrust::advance(p2,Annz);
44832ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
44848909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
44858909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
44868909a122SStefano Zampini #endif
44872ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
44882ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
44892ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
44902ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
44912ed87e7eSStefano Zampini #else
44922ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
44932ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
44942ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
44952ed87e7eSStefano Zampini #endif
4496ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
44972ed87e7eSStefano Zampini                               Ccoo->data().get(),
4498ed502f03SStefano Zampini                               c->nz,
4499ed502f03SStefano Zampini                               m,
4500ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
45019566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
45029566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
45032ed87e7eSStefano Zampini       delete wPerm;
45042ed87e7eSStefano Zampini       delete Acoo;
45052ed87e7eSStefano Zampini       delete Bcoo;
45062ed87e7eSStefano Zampini       delete Ccoo;
4507ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4508ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4509ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4510ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
45119566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4512ed502f03SStefano Zampini #endif
45131a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45149566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
45159566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4516ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4517ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4518ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4519ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4520ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4521ed502f03SStefano Zampini 
45221a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45231a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4524a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4525ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4526ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4527ed502f03SStefano Zampini         CcsrT->num_rows = n;
4528ed502f03SStefano Zampini         CcsrT->num_cols = m;
4529ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4530ed502f03SStefano Zampini 
4531ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4532ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4533ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4534ed502f03SStefano Zampini 
45359566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4536ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4537ed502f03SStefano Zampini         if (AT) {
4538ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4539ed502f03SStefano Zampini           thrust::advance(rT,-1);
4540ed502f03SStefano Zampini         }
4541ed502f03SStefano Zampini         if (BT) {
4542ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4543ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4544ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4545ed502f03SStefano Zampini         }
4546ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4547ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4548ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4549ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4550ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4551ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
45529566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4553ed502f03SStefano Zampini 
45549566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
45559566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
45569566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
45579566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
45589566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
45599566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
45609566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
45619566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
45629566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4563ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4564ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4565ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4566ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
45679566063dSJacob Faibussowitsch                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4568ed502f03SStefano Zampini #endif
4569ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4570ed502f03SStefano Zampini       }
4571ed502f03SStefano Zampini     }
4572ed502f03SStefano Zampini 
4573ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4574ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4575ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
45769566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m+1,&c->i));
45779566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->j));
4578ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4579ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4580ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4581ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4582ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
45839566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45849566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4585ed502f03SStefano Zampini     } else {
45869566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45879566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4588ed502f03SStefano Zampini     }
45899566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
45909566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->ilen));
45919566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->imax));
4592ed502f03SStefano Zampini     c->maxnz = c->nz;
4593ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4594ed502f03SStefano Zampini     c->rmax = 0;
4595ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4596ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4597ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4598ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4599ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4600ed502f03SStefano Zampini     }
46019566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
46029566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->a));
4603ed502f03SStefano Zampini     (*C)->nonzerostate++;
46049566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
46059566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4606ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4607ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4608ed502f03SStefano Zampini   } else {
4609*08401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4610ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4611ed502f03SStefano Zampini     if (c->nz) {
4612ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
46135f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
46142c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4615*08401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
46169566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46179566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
46185f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
46195f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4620ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4621ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4622ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
46232c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
46242c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
46252c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
46262c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
46275f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4628ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4629ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
46309566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
4631ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4632ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4633ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4634ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4635ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4636ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4637ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4638ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4639ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4640ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
46419566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
46421a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
46435f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4644ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4645ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4646ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4647ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4648ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4649ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4650ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46511a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4652ed502f03SStefano Zampini       }
46539566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4654ed502f03SStefano Zampini     }
4655ed502f03SStefano Zampini   }
46569566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4657ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4658ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4659ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4660ed502f03SStefano Zampini   PetscFunctionReturn(0);
4661ed502f03SStefano Zampini }
4662c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the device value array of A into v

  Input Parameters:
+ A   - the matrix; its value array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - positions in the value array to gather from, read as a host array;
        if NULL (or n is zero) the first n entries are copied contiguously

  Output Parameter:
. v - destination for the n values; isCudaMem() decides whether it is treated as
      device memory or as host memory

  Notes:
  With indices and a host destination, the values are gathered into a temporary
  device buffer and then copied back to the host in one cudaMemcpy().

  The temporary buffer is a scoped object, so it is released even when an error
  macro returns early.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    /* device staging buffer, only sized when v cannot be written from the device */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[idx[k]] for k in [0,n) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      PetscCallCUDA(cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination means the values travelled from the GPU to the CPU */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4700