xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision d0609ced746bc51b019815ca91d747429db24893) !
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Name table handed to PetscOptionsEnum() for MatCUSPARSEStorageFormat: the value names,
   followed by the enum type name, the common enum prefix and a null terminator */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  NULL
};
23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26afb2bd1cSJunchao Zhang 
27afb2bd1cSJunchao Zhang   typedef enum {
28afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
29afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
30afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
31afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
32afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
33afb2bd1cSJunchao Zhang 
34afb2bd1cSJunchao Zhang   typedef enum {
35afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
47afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
48afb2bd1cSJunchao Zhang 
49afb2bd1cSJunchao Zhang   typedef enum {
50afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
53afb2bd1cSJunchao Zhang   */
/* Name tables for PetscOptionsEnum(); each lists the value names in 0-based enum value order,
   then the enum type name, the common prefix and a null terminator */
const char *const MatCUSPARSESpMVAlgorithms[] = {
  "MV_ALG_DEFAULT",
  "COOMV_ALG",
  "CSRMV_ALG1",
  "CSRMV_ALG2",
  "cusparseSpMVAlg_t",
  "CUSPARSE_",
  NULL
};
const char *const MatCUSPARSESpMMAlgorithms[] = {
  "ALG_DEFAULT",
  "COO_ALG1",
  "COO_ALG2",
  "COO_ALG3",
  "CSR_ALG1",
  "COO_ALG4",
  "CSR_ALG2",
  "cusparseSpMMAlg_t",
  "CUSPARSE_SPMM_",
  NULL
};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {
  "INVALID", /* cusparse does not have enum 0! We created one */
  "ALG1",
  "ALG2",
  "cusparseCsr2CscAlg_t",
  "CUSPARSE_CSR2CSC_",
  NULL
};
57afb2bd1cSJunchao Zhang #endif
589ae82921SPaul Mullowney 
59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
877f756511SDominic Meiser 
8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
9057181aedSStefano Zampini 
91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94c215019aSStefano Zampini 
/*
  MatFactorGetSolverType_seqaij_cusparse - reports MATSOLVERCUSPARSE as the solver type.

  The matrix argument is not examined; only *type is written.
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  (void)A; /* unused */
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1019ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1149ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - creates the factor matrix used by the "cusparse" solver package.

  Input Parameters:
+ A     - the matrix to be factored; its local row count n gives the n x n size of the factor
- ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

  Output Parameter:
. B - new MATSEQAIJCUSPARSE matrix with factortype, symbolic factorization routines and
      preferred orderings set; no nonzero storage is preallocated (MAT_SKIP_ALLOCATION)

  Notes:
  Any other factor type raises PETSC_ERR_SUP. The factor type is validated before the new matrix
  is created so that an unsupported request does not leave an allocated matrix behind in *B.

  If A is bound to the CPU, the SeqAIJ symbolic routines are installed instead of the CUSPARSE ones.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt  n          = A->rmap->n;
  const PetscBool isLU       = (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) ? PETSC_TRUE : PETSC_FALSE;
  const PetscBool isCholesky = (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  /* reject unsupported factor types before *B is allocated so nothing is leaked on the error path */
  if (!isLU && !isCholesky) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B));
  PetscCall(MatSetSizes(*B,n,n,n,n));
  /* NOTE(review): factortype is assigned before MatSetType(); presumably the type constructor consults it - confirm before reordering */
  (*B)->factortype = ftype;
  PetscCall(MatSetType(*B,MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B,PETSC_TRUE));
  if (isLU) {
    PetscCall(MatSetBlockSizesFromMats(*B,A,A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else { /* Cholesky or ICC, guaranteed by the check above */
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  }

  PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(0);
}
1559ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - stores the requested GPU storage format in the
  Mat_SeqAIJCUSPARSE data attached to A.

  MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL both write the single format field; any other
  operation raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  if (op == MAT_CUSPARSE_MULT || op == MAT_CUSPARSE_ALL) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    cusp->format = format;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  PetscFunctionReturn(0);
}
1739ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* hand the request to the implementation looked up under the name "MatCUSPARSESetFormat_C" */
  PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  PetscFunctionReturn(0);
}
198e057df02SPaul Mullowney 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - records the use_cpu flag in the
  Mat_SeqAIJCUSPARSE data attached to A.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
207365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether the built-in CPU MatSolve is used.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or the GPU (GPU is the default).

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* hand the request to the implementation looked up under the name "MatCUSPARSESetUseCPUSolve_C" */
  PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));
  PetscFunctionReturn(0);
}
233365b711fSMark Adams 
/*
  MatSetOption_SeqAIJCUSPARSE - handles MAT_FORM_EXPLICIT_TRANSPOSE locally and forwards
  every other option to MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscFunctionBegin;
  if (op != MAT_FORM_EXPLICIT_TRANSPOSE) {
    PetscCall(MatSetOption_SeqAIJ(A,op,flg));
    PetscFunctionReturn(0);
  }
  /* when the option is being switched off, invalidate the transpose so that a stale
     copy cannot cause logic errors if the option is switched back on later */
  if (!flg && A->form_explicit_transpose) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  A->form_explicit_transpose = flg;
  PetscFunctionReturn(0);
}
249e6e9a74fSStefano Zampini 
250bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
251bddcd29dSMark Adams 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - numeric (I)LU factorization for the cusparse solver package.

  Input Parameters:
+ B    - the factor matrix produced by the symbolic stage
. A    - the matrix being factored
- info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

  Notes:
  The values of A are first brought back from the GPU and the factorization itself is done by
  MatLUFactorNumeric_SeqAIJ(), after which B is marked as living on the CPU. Unless the CPU solve
  was requested, the CUSPARSE solve routines are then installed on B (the natural-ordering variants
  when both row and column permutations are the identity) and
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() is called on B.

  The use_cpu_solve flag is read from A. B is a factor matrix whose spptr is treated as
  Mat_SeqAIJCUSPARSETriFactors by the symbolic routines in this file, so it must not be
  reinterpreted as Mat_SeqAIJCUSPARSE; the flag itself is only ever set on unfactored matrices.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  PetscBool          row_identity,col_identity;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* fall back to the default (GPU solve) if A carries no CUSPARSE data */
  const PetscBool    use_cpu_solve = (cusparsestruct && cusparsestruct->use_cpu_solve) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;
  /* determine which version of MatSolve needs to be used. */
  PetscCall(ISIdentity(isrow,&row_identity));
  PetscCall(ISIdentity(iscol,&col_identity));

  /* the multiple right-hand-side solves are cleared for both the CPU and the GPU solve paths */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  if (!use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    /* get the triangular factors */
    PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  }
  PetscFunctionReturn(0);
}
288bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - processes the -mat_cusparse_* command line options.

  The options are only examined for unfactored matrices (factortype MAT_FACTOR_NONE). With
  CUDA 11.0 or later the SpMV, SpMM and CSR-to-CSC algorithm options are also read; after each
  of them a sanity check verifies that one representative cuSPARSE enum value still matches its
  position in the corresponding PETSc name table.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"SeqAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format",
                               "sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",
                               MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set));
    if (set) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt));

    /* storage format used for all operations */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format",
                               "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                               "MatCUSPARSESetFormat",
                               MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set));
    if (set) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt));

    PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusp->use_cpu_solve,&cusp->use_cpu_solve,&set));
    if (set) PetscCall(MatCUSPARSESetUseCPUSolve(A,cusp->use_cpu_solve));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg",
                               "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                               "cusparseSpMVAlg_t",
                               MatCUSPARSESpMVAlgorithms,(PetscEnum)cusp->spmvAlg,(PetscEnum*)&cusp->spmvAlg,&set));
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
#if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
    PetscCheckFalse(set && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(set && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif
    PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg",
                               "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                               "cusparseSpMMAlg_t",
                               MatCUSPARSESpMMAlgorithms,(PetscEnum)cusp->spmmAlg,(PetscEnum*)&cusp->spmmAlg,&set));
    PetscCheckFalse(set && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    PetscCall(PetscOptionsEnum("-mat_cusparse_csr2csc_alg",
                               "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                               "cusparseCsr2CscAlg_t",
                               MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusp->csr2cscAlg,(PetscEnum*)&cusp->csr2cscAlg,&set));
    PetscCheckFalse(set && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
3289ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - symbolic ILU for the cusparse solver package.

  Resets the Mat_SeqAIJCUSPARSETriFactors data attached to B, delegates the symbolic
  factorization to MatILUFactorSymbolic_SeqAIJ(), and installs the CUSPARSE numeric LU routine.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3399ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - symbolic LU for the cusparse solver package.

  Resets the Mat_SeqAIJCUSPARSETriFactors data attached to B, delegates the symbolic
  factorization to MatLUFactorSymbolic_SeqAIJ(), and installs the CUSPARSE numeric LU routine.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3509ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - symbolic ICC for the cusparse solver package.

  Resets the Mat_SeqAIJCUSPARSETriFactors data attached to B, delegates the symbolic
  factorization to MatICCFactorSymbolic_SeqAIJ(), and installs the CUSPARSE numeric Cholesky routine.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
361087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - symbolic Cholesky for the cusparse solver package.

  Resets the Mat_SeqAIJCUSPARSETriFactors data attached to B, delegates the symbolic
  factorization to MatCholeskyFactorSymbolic_SeqAIJ(), and installs the CUSPARSE numeric Cholesky routine.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&factors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
372087f3262SPaul Mullowney 
373087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3749ae82921SPaul Mullowney {
3759ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3769ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3779ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
378aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3799ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3809ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3819ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3829ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
3839ae82921SPaul Mullowney 
3849ae82921SPaul Mullowney   PetscFunctionBegin;
385cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
386c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3879ae82921SPaul Mullowney     try {
3889ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3899ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
390da79fbbcSStefano Zampini       if (!loTriFactor) {
3912cbc15d9SMark         PetscScalar                       *AALo;
3922cbc15d9SMark 
3939566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar)));
3949ae82921SPaul Mullowney 
3959ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
3969566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt)));
3979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt)));
3989ae82921SPaul Mullowney 
3999ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
4009ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
4019ae82921SPaul Mullowney         AiLo[n]  = nzLower;
4029ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
4039ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4049ae82921SPaul Mullowney         v        = aa;
4059ae82921SPaul Mullowney         vi       = aj;
4069ae82921SPaul Mullowney         offset   = 1;
4079ae82921SPaul Mullowney         rowOffset= 1;
4089ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4099ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
410e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4119ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4129ae82921SPaul Mullowney           rowOffset += nz+1;
4139ae82921SPaul Mullowney 
4149566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AjLo[offset]), vi, nz));
4159566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(AALo[offset]), v, nz));
4169ae82921SPaul Mullowney 
4179ae82921SPaul Mullowney           offset      += nz;
4189ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4199ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4209ae82921SPaul Mullowney           offset      += 1;
4219ae82921SPaul Mullowney 
4229ae82921SPaul Mullowney           v  += nz;
4239ae82921SPaul Mullowney           vi += nz;
4249ae82921SPaul Mullowney         }
4252205254eSKarl Rupp 
426aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
4279566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
428da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
429aa372e3fSPaul Mullowney         /* Create the matrix description */
4309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
4319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
4321b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
4339566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
434afb2bd1cSJunchao Zhang        #else
4359566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
436afb2bd1cSJunchao Zhang        #endif
4379566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER));
4389566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
439aa372e3fSPaul Mullowney 
440aa372e3fSPaul Mullowney         /* set the operation */
441aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
442aa372e3fSPaul Mullowney 
443aa372e3fSPaul Mullowney         /* set the matrix */
444aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
445aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
446aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
447aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
448aa372e3fSPaul Mullowney 
449aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
450aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
451aa372e3fSPaul Mullowney 
452aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
453aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
454aa372e3fSPaul Mullowney 
455aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
456aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
457aa372e3fSPaul Mullowney 
458afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
4599566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
4609566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
4611b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
4629566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
463afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
464afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
465afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
4665f80ce2aSJacob Faibussowitsch                                                &loTriFactor->solveBufferSize));
4679566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
468afb2bd1cSJunchao Zhang       #endif
469afb2bd1cSJunchao Zhang 
470aa372e3fSPaul Mullowney         /* perform the solve analysis */
4719566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
472aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
473aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
474d49cd2b7SBarry Smith                                          loTriFactor->csrMat->column_indices->data().get(),
4751b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
476d49cd2b7SBarry Smith                                          loTriFactor->solveInfo,
4775f80ce2aSJacob Faibussowitsch                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
478d49cd2b7SBarry Smith                                          #else
4795f80ce2aSJacob Faibussowitsch                                          loTriFactor->solveInfo));
480afb2bd1cSJunchao Zhang                                          #endif
4819566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
4829566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
483aa372e3fSPaul Mullowney 
484da79fbbcSStefano Zampini         /* assign the pointer */
485aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4862cbc15d9SMark         loTriFactor->AA_h = AALo;
4879566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiLo));
4889566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjLo));
4899566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
490da79fbbcSStefano Zampini       } else { /* update values only */
4912cbc15d9SMark         if (!loTriFactor->AA_h) {
4929566063dSJacob Faibussowitsch           PetscCallCUDA(cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar)));
4932cbc15d9SMark         }
494da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4952cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
496da79fbbcSStefano Zampini         v        = aa;
497da79fbbcSStefano Zampini         vi       = aj;
498da79fbbcSStefano Zampini         offset   = 1;
499da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
500da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
5019566063dSJacob Faibussowitsch           PetscCall(PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz));
502da79fbbcSStefano Zampini           offset      += nz;
5032cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
504da79fbbcSStefano Zampini           offset      += 1;
505da79fbbcSStefano Zampini           v  += nz;
506da79fbbcSStefano Zampini         }
5072cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
5089566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
509da79fbbcSStefano Zampini       }
5109ae82921SPaul Mullowney     } catch(char *ex) {
51198921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5129ae82921SPaul Mullowney     }
5139ae82921SPaul Mullowney   }
5149ae82921SPaul Mullowney   PetscFunctionReturn(0);
5159ae82921SPaul Mullowney }
5169ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Creates, or refreshes the values of, the upper triangular
  ILU factor stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameter:
. A - matrix whose A->data is read as a Mat_SeqAIJ holding the factored values

  Notes:
  Returns immediately when A has no local rows, and does no work unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Row i of the factor is read from a->a[a->diag[i+1]+1 .. a->diag[i]]: the off-diagonal entries come
  first and the entry at a->diag[i] comes last. In the CSR arrays built here each row starts with the
  diagonal column, whose value is the reciprocal of that last entry, followed by the off-diagonal
  entries copied unchanged.

  When no upTriFactorPtr exists yet, the CSR structure, the matrix descriptor and the solve-analysis
  data are created, and the host value buffer is kept in AA_h for reuse. Otherwise only the values
  are recomputed on the host and copied into the existing device value array.

  A char* exception thrown inside the try block is turned into a PETSC_ERR_LIB error.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* host buffer for the values; it is handed to upTriFactor->AA_h below and not freed here */
        PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));

        /* Allocate Space for the upper triangular matrix */
        PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
        PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));

        /* Fill the upper triangular matrix; rows are visited last to first so that each row
           offset can be derived from the one after it */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements: v[nz] is the entry at adiag[i] */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          /* then the off-diagonal columns and values, unchanged */
          PetscCall(PetscArraycpy(&(AjUp[offset+1]), vi, nz));
          PetscCall(PetscArraycpy(&(AAUp[offset+1]), v, nz));
        }

        /* allocate space for the triangular factor information */
        PetscCall(PetscNew(&upTriFactor));
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
       #else
        PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
       #endif
        PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
        PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
        PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                               &upTriFactor->solveBufferSize));
        PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
      #endif

        /* perform the solve analysis; the version check selects a complete call so that no
           preprocessor directive appears inside the argument list of PetscCallCUSPARSE() */
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         upTriFactor->solveInfo,
                                         upTriFactor->solvePolicy, upTriFactor->solveBuffer));
      #else
        PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                         upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                         upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                         upTriFactor->csrMat->column_indices->data().get(),
                                         upTriFactor->solveInfo));
      #endif
        PetscCallCUDA(WaitForCUDA());
        PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        upTriFactor->AA_h = AAUp;
        /* the index staging buffers are no longer needed once copied into the device arrays */
        PetscCallCUDA(cudaFreeHost(AiUp));
        PetscCallCUDA(cudaFreeHost(AjUp));
        PetscCall(PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          PetscCallCUDA(cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar)));
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          PetscCall(PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz));
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        PetscCall(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
      }
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
6599ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds both ILU triangular factors for A and uploads
  the row/column permutations when they are not the identity.

  Input Parameter:
. A - matrix whose A->spptr holds a Mat_SeqAIJCUSPARSETriFactors and whose A->data is a Mat_SeqAIJ

  Notes:
  Errors with PETSC_ERR_COR when A->spptr is NULL. The work vector and the permutation index arrays
  are created only if they do not exist yet. A->offloadmask is set to PETSC_OFFLOAD_BOTH once the
  two factors have been built.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  const PetscInt               *idx;
  PetscBool                    identity;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* lower factor first, then upper */
  PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nrows);
  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: copied to the device once, and only when it is not the identity */
  PetscCall(ISIdentity(rowperm,&identity));
  if (!(identity || factors->rpermIndices)) {
    PetscCall(ISGetIndices(rowperm,&idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    PetscCall(ISRestoreIndices(rowperm,&idx));
    PetscCall(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
  }

  /* column permutation: same treatment */
  PetscCall(ISIdentity(colperm,&identity));
  if (!(identity || factors->cpermIndices)) {
    PetscCall(ISGetIndices(colperm,&idx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx, idx+nrows);
    PetscCall(ISRestoreIndices(colperm,&idx));
    PetscCall(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
7029ae82921SPaul Mullowney 
703087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
704087f3262SPaul Mullowney {
705087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
706087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
707aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
708aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
709087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
710087f3262SPaul Mullowney   PetscScalar                       *AAUp;
711087f3262SPaul Mullowney   PetscScalar                       *AALo;
712087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
713087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
714087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
715087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
716087f3262SPaul Mullowney 
717087f3262SPaul Mullowney   PetscFunctionBegin;
718cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
719c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
720087f3262SPaul Mullowney     try {
7219566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
7229566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
723da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
724087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7259566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
7269566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
727087f3262SPaul Mullowney 
728087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
729087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
730087f3262SPaul Mullowney         AiUp[n]=nzUpper;
731087f3262SPaul Mullowney         offset = 0;
732087f3262SPaul Mullowney         for (i=0; i<n; i++) {
733087f3262SPaul Mullowney           /* set the pointers */
734087f3262SPaul Mullowney           v  = aa + ai[i];
735087f3262SPaul Mullowney           vj = aj + ai[i];
736087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
737087f3262SPaul Mullowney 
738087f3262SPaul Mullowney           /* first, set the diagonal elements */
739087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
74009f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
741087f3262SPaul Mullowney           AiUp[i]      = offset;
74209f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
743087f3262SPaul Mullowney 
744087f3262SPaul Mullowney           offset+=1;
745087f3262SPaul Mullowney           if (nz>0) {
7469566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AjUp[offset]), vj, nz));
7479566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
748087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
749087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
750087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
751087f3262SPaul Mullowney             }
752087f3262SPaul Mullowney             offset+=nz;
753087f3262SPaul Mullowney           }
754087f3262SPaul Mullowney         }
755087f3262SPaul Mullowney 
756aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
7579566063dSJacob Faibussowitsch         PetscCall(PetscNew(&upTriFactor));
758da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
759087f3262SPaul Mullowney 
760aa372e3fSPaul Mullowney         /* Create the matrix description */
7619566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
7629566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
7631b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
7649566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
765afb2bd1cSJunchao Zhang        #else
7669566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
767afb2bd1cSJunchao Zhang        #endif
7689566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
7699566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
770087f3262SPaul Mullowney 
771aa372e3fSPaul Mullowney         /* set the matrix */
772aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
773aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
774aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
775aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
776aa372e3fSPaul Mullowney 
777aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
778aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
779aa372e3fSPaul Mullowney 
780aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
781aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
782aa372e3fSPaul Mullowney 
783aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
784aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
785aa372e3fSPaul Mullowney 
786afb2bd1cSJunchao Zhang         /* set the operation */
787afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
788afb2bd1cSJunchao Zhang 
789afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
7909566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
7919566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
7921b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
7939566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
794afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
795afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
796afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
7975f80ce2aSJacob Faibussowitsch                                                &upTriFactor->solveBufferSize));
7989566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
799afb2bd1cSJunchao Zhang       #endif
800afb2bd1cSJunchao Zhang 
801aa372e3fSPaul Mullowney         /* perform the solve analysis */
8029566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
803aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
804aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
805d49cd2b7SBarry Smith                                          upTriFactor->csrMat->column_indices->data().get(),
8061b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
807d49cd2b7SBarry Smith                                          upTriFactor->solveInfo,
8085f80ce2aSJacob Faibussowitsch                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
809d49cd2b7SBarry Smith                                          #else
8105f80ce2aSJacob Faibussowitsch                                          upTriFactor->solveInfo));
811afb2bd1cSJunchao Zhang                                          #endif
8129566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8139566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
814aa372e3fSPaul Mullowney 
815da79fbbcSStefano Zampini         /* assign the pointer */
816aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
817aa372e3fSPaul Mullowney 
818aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8199566063dSJacob Faibussowitsch         PetscCall(PetscNew(&loTriFactor));
820da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
821aa372e3fSPaul Mullowney 
822aa372e3fSPaul Mullowney         /* Create the matrix description */
8239566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8249566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8251b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8269566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
827afb2bd1cSJunchao Zhang        #else
8289566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
829afb2bd1cSJunchao Zhang        #endif
8309566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8319566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
832aa372e3fSPaul Mullowney 
833aa372e3fSPaul Mullowney         /* set the operation */
834aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
835aa372e3fSPaul Mullowney 
836aa372e3fSPaul Mullowney         /* set the matrix */
837aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
838aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
839aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
840aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
841aa372e3fSPaul Mullowney 
842aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
843aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
846aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
847aa372e3fSPaul Mullowney 
848aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
849aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
850aa372e3fSPaul Mullowney 
851afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8529566063dSJacob Faibussowitsch         PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
8539566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
8541b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8559566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
856afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
857afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
858afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
8595f80ce2aSJacob Faibussowitsch                                                &loTriFactor->solveBufferSize));
8609566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
861afb2bd1cSJunchao Zhang       #endif
862afb2bd1cSJunchao Zhang 
863aa372e3fSPaul Mullowney         /* perform the solve analysis */
8649566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
865aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
866aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
867d49cd2b7SBarry Smith                                          loTriFactor->csrMat->column_indices->data().get(),
8681b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
869d49cd2b7SBarry Smith                                          loTriFactor->solveInfo,
8705f80ce2aSJacob Faibussowitsch                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
871d49cd2b7SBarry Smith                                          #else
8725f80ce2aSJacob Faibussowitsch                                          loTriFactor->solveInfo));
873afb2bd1cSJunchao Zhang                                          #endif
8749566063dSJacob Faibussowitsch         PetscCallCUDA(WaitForCUDA());
8759566063dSJacob Faibussowitsch         PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
876aa372e3fSPaul Mullowney 
877da79fbbcSStefano Zampini         /* assign the pointer */
878aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
879087f3262SPaul Mullowney 
8809566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
8819566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AiUp));
8829566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFreeHost(AjUp));
883da79fbbcSStefano Zampini       } else {
884da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
885da79fbbcSStefano Zampini         offset = 0;
886da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
887da79fbbcSStefano Zampini           /* set the pointers */
888da79fbbcSStefano Zampini           v  = aa + ai[i];
889da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
890da79fbbcSStefano Zampini 
891da79fbbcSStefano Zampini           /* first, set the diagonal elements */
892da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
893da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
894da79fbbcSStefano Zampini 
895da79fbbcSStefano Zampini           offset+=1;
896da79fbbcSStefano Zampini           if (nz>0) {
8979566063dSJacob Faibussowitsch             PetscCall(PetscArraycpy(&(AAUp[offset]), v, nz));
898da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
899da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
900da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
901da79fbbcSStefano Zampini             }
902da79fbbcSStefano Zampini             offset+=nz;
903da79fbbcSStefano Zampini           }
904da79fbbcSStefano Zampini         }
90528b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
90628b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
907da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
908da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
9099566063dSJacob Faibussowitsch         PetscCall(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
910da79fbbcSStefano Zampini       }
9119566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AAUp));
9129566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFreeHost(AALo));
913087f3262SPaul Mullowney     } catch(char *ex) {
91498921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
915087f3262SPaul Mullowney     }
916087f3262SPaul Mullowney   }
917087f3262SPaul Mullowney   PetscFunctionReturn(0);
918087f3262SPaul Mullowney }
919087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Refreshes the GPU-side data used by the
  CUSPARSE triangular solves of an ICC/Cholesky factor.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Steps performed:
  1. calls MatSeqAIJCUSPARSEBuildICCTriMatrices(A);
  2. allocates the work vector (length n = A->rmap->n) if it does not exist yet;
  3. sets cusparseTriFactors->nnz and marks A->offloadmask as PETSC_OFFLOAD_BOTH;
  4. if the ordering a->row is not the identity, uploads it and its inverse into
     rpermIndices and cpermIndices respectively.

  Notes:
  This routine can run once per numeric factorization, so the permutation arrays are
  allocated only when absent and otherwise refilled in place; allocating them
  unconditionally leaked the previous arrays on every refactorization.
  NOTE(review): this relies on rpermIndices/cpermIndices being NULL until first use,
  the same assumption the workVector guard already makes - confirm against the
  creation/reset code of Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* presumably: every off-diagonal entry counted twice plus the n diagonal entries - TODO confirm */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* upload the permutation and its inverse when the ordering is not the identity */
  PetscCall(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    PetscCall(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    PetscCall(ISGetIndices(iip,&irip));
    PetscCall(ISGetIndices(ip,&rip));
    /* reuse the device arrays from a previous call instead of leaking them; assign() resizes as needed */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    PetscCall(ISRestoreIndices(iip,&irip));
    PetscCall(ISDestroy(&iip));
    PetscCall(ISRestoreIndices(ip,&rip));
    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
956087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SEQAIJCUSPARSE.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded unchanged to MatCholeskyFactorNumeric_SeqAIJ()

  The routine calls MatSeqAIJCUSPARSECopyFromGPU(A), runs MatCholeskyFactorNumeric_SeqAIJ(),
  sets B->offloadmask to PETSC_OFFLOAD_CPU, installs the solve callbacks that match the
  ordering stored in B (natural vs. permuted), and finally calls
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B).
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *factor  = (Mat_SeqAIJ*)B->data;
  IS         ordering = factor->row;
  PetscBool  natural;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve flavour according to the ordering */
  PetscCall(ISIdentity(ordering,&natural));
  /* no MatMatSolve variants are installed for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build the triangular factors used by the solves */
  PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
9859ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private - Builds the explicit transpose of one
  triangular factor and runs the CUSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix (used only as the object for event logging)
. cusparseTriFactors - container providing the CUSPARSE handle
- triFactor          - the triangular factor (CSR) to transpose

  Output Parameter:
. triFactorTranspose - newly allocated factor holding the transposed matrix

  Notes:
  The new descriptor copies matrix type, index base and diagonal type from triFactor and
  flips the fill mode (lower <-> upper). The transposed matrix is stored explicitly, so
  its solve operation is CUSPARSE_OPERATION_NON_TRANSPOSE.
  With CUDA >= 11 the csr2csc work buffer is allocated and kept in triFactor
  (csr2cscBuffer / csr2cscBufferSize), not in the new factor.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTranspose)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  CsrMatrix                         *src,*dst;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  PetscCheck(triFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  src = triFactor->csrMat;

  /* allocate space for the transpose of the triangular factor */
  PetscCall(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the original factor; the transpose has the opposite fill mode */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  PetscCallCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  PetscCallCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  PetscCallCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  PetscCallCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  dst = new CsrMatrix;
  triFactorT->csrMat  = dst;
  dst->num_rows       = src->num_cols;
  dst->num_cols       = src->num_rows;
  dst->num_entries    = src->num_entries;
  dst->row_offsets    = new THRUSTINTARRAY32(dst->num_rows+1);
  dst->column_indices = new THRUSTINTARRAY32(dst->num_entries);
  dst->values         = new THRUSTARRAY(dst->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, src->num_rows,
                                               src->num_cols, src->num_entries,
                                               src->values->data().get(),
                                               src->row_offsets->data().get(),
                                               src->column_indices->data().get(),
                                               dst->values->data().get(),
                                               dst->row_offsets->data().get(), dst->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC, indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize));
#endif

  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
  /* the version switch is kept outside the macro argument list: directives inside macro arguments are undefined behaviour */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, src->num_rows,
                                  src->num_cols, src->num_entries,
                                  src->values->data().get(),
                                  src->row_offsets->data().get(),
                                  src->column_indices->data().get(),
                                  dst->values->data().get(),
                                  dst->row_offsets->data().get(), dst->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer));
#else
  PetscCallCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, src->num_rows,
                                  src->num_cols, src->num_entries,
                                  src->values->data().get(),
                                  src->row_offsets->data().get(),
                                  src->column_indices->data().get(),
                                  dst->values->data().get(),
                                  dst->column_indices->data().get(), dst->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  PetscCallCUDA(WaitForCUDA());
  /* close the event opened above (this used to be a second PetscLogEventBegin()) */
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  PetscCallCUSPARSE(cusparse_create_analysis_info(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                         dst->num_rows, dst->num_entries, triFactorT->descr,
                                         dst->values->data().get(), dst->row_offsets->data().get(),
                                         dst->column_indices->data().get(), triFactorT->solveInfo,
                                         &triFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                                   dst->num_rows, dst->num_entries, triFactorT->descr,
                                   dst->values->data().get(), dst->row_offsets->data().get(),
                                   dst->column_indices->data().get(),
                                   triFactorT->solveInfo,
                                   triFactorT->solvePolicy, triFactorT->solveBuffer));
#else
  PetscCallCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                                   dst->num_rows, dst->num_entries, triFactorT->descr,
                                   dst->values->data().get(), dst->row_offsets->data().get(),
                                   dst->column_indices->data().get(),
                                   triFactorT->solveInfo));
#endif
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  *triFactorTranspose = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Creates explicit transposes of the lower and
  upper triangular factors stored in A->spptr and analyzes them for triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are already set

  On success the results are stored in loTriFactorPtrTranspose and upTriFactorPtrTranspose.
  The lower factor is processed first and its pointer is stored before the upper factor
  is started.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;

  /* transpose of the lower triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT));
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  PetscCall(MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT));
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1189bda325fcSPaul Mullowney 
/*
  PetscScalarToPetscInt - unary functor, callable from host and device code, that maps a
  PetscScalar to a PetscInt by taking PetscRealPart() of the argument and converting the
  result to PetscInt.
*/
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1198a49f1ed0SStefano Zampini 
11993606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1200bda325fcSPaul Mullowney {
1201aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1202a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1203bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1204bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1205aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1206b175d8bbSPaul Mullowney 
1207bda325fcSPaul Mullowney   PetscFunctionBegin;
12089566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
1209a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
121028b400f6SJacob Faibussowitsch   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1211a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
121208401ef6SPierre Jolivet   PetscCheck(!A->transupdated || matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
12131a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
12149566063dSJacob Faibussowitsch   PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
12159566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1216a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
12179566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1218a49f1ed0SStefano Zampini   }
1219a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1220aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12219566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1222aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12239566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12249566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1225aa372e3fSPaul Mullowney 
1226b06137fdSPaul Mullowney     /* set alpha and beta */
12279566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
12289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
12299566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
12309566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
12319566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
12329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1233b06137fdSPaul Mullowney 
1234aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1235aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1236a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1237554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1238554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1239aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1240a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1241aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1242aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1243a3fdcf43SKarl Rupp 
1244039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
124581902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1246afb2bd1cSJunchao Zhang 
1247afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
12483606e59fSJunchao Zhang       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1249afb2bd1cSJunchao Zhang         stat = cusparseCreateCsr(&matstructT->matDescr,
1250afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1251afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1252afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1253afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
12549566063dSJacob Faibussowitsch                                indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
12553606e59fSJunchao Zhang       #else
12563606e59fSJunchao Zhang         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12573606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12583606e59fSJunchao Zhang 
12593606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
12603606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
12613606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
12623606e59fSJunchao Zhang         */
12633606e59fSJunchao Zhang         if (matrixT->num_entries) {
12643606e59fSJunchao Zhang           stat = cusparseCreateCsr(&matstructT->matDescr,
12653606e59fSJunchao Zhang                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
12663606e59fSJunchao Zhang                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
12673606e59fSJunchao Zhang                                  matrixT->values->data().get(),
12683606e59fSJunchao Zhang                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
12699566063dSJacob Faibussowitsch                                  indexBase,cusparse_scalartype);PetscCallCUSPARSE(stat);
12703606e59fSJunchao Zhang 
12713606e59fSJunchao Zhang         } else {
12723606e59fSJunchao Zhang           matstructT->matDescr = NULL;
12733606e59fSJunchao Zhang           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
12743606e59fSJunchao Zhang         }
12753606e59fSJunchao Zhang       #endif
1276afb2bd1cSJunchao Zhang      #endif
1277aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1278afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1279afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1280afb2bd1cSJunchao Zhang    #else
1281aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
128251c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
128351c6d536SStefano Zampini       /* First convert HYB to CSR */
1284aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1285aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1286aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1287aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1288aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1289aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1290aa372e3fSPaul Mullowney 
1291aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1292aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1293aa372e3fSPaul Mullowney                               temp->values->data().get(),
1294aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
12959566063dSJacob Faibussowitsch                               temp->column_indices->data().get());PetscCallCUSPARSE(stat);
1296aa372e3fSPaul Mullowney 
1297aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1298aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1299aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1300aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1301aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1302aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1303aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1304aa372e3fSPaul Mullowney 
1305aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1306aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1307aa372e3fSPaul Mullowney                               temp->values->data().get(),
1308aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1309aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1310aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1311aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1312aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
13139566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1314aa372e3fSPaul Mullowney 
1315aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1316aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13179566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
1318aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1319aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1320aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1321aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1322aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1323aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
13249566063dSJacob Faibussowitsch                               hybMat, 0, partition);PetscCallCUSPARSE(stat);
1325aa372e3fSPaul Mullowney 
1326aa372e3fSPaul Mullowney       /* assign the pointer */
1327aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13281a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1329aa372e3fSPaul Mullowney       /* delete temporaries */
1330aa372e3fSPaul Mullowney       if (tempT) {
1331aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1332aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1333aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1334aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1335087f3262SPaul Mullowney       }
1336aa372e3fSPaul Mullowney       if (temp) {
1337aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1338aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1339aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1340aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1341aa372e3fSPaul Mullowney       }
1342afb2bd1cSJunchao Zhang      #endif
1343aa372e3fSPaul Mullowney     }
1344a49f1ed0SStefano Zampini   }
1345a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1346a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1347a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
134828b400f6SJacob Faibussowitsch     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
134928b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
135028b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
135128b400f6SJacob Faibussowitsch     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
135228b400f6SJacob Faibussowitsch     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
135328b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
135428b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
135528b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1356a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1357a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1358a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
13599566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1360a49f1ed0SStefano Zampini     }
1361a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1362a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1363a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1364a49f1ed0SStefano Zampini 
1365a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1366a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1367a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1368a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1369a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1370a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1371a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1372a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1373a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1374a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1375a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1376a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
13779566063dSJacob Faibussowitsch                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);PetscCallCUSPARSE(stat);
13789566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1379a49f1ed0SStefano Zampini      #endif
1380a49f1ed0SStefano Zampini 
13811a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13821a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13831a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13841a2c6b5cSJunchao Zhang            I checked every parameter and they were all fine. I have no clue why cusparse complains.
13851a2c6b5cSJunchao Zhang 
13861a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13871a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13881a2c6b5cSJunchao Zhang         */
13891a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
13901a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
13911a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
13921a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
13931a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1394a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1395a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1396a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1397a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
13989566063dSJacob Faibussowitsch                               cusparsestruct->csr2cscAlg, csr2cscBuffer);PetscCallCUSPARSE(stat);
1399a49f1ed0SStefano Zampini                              #else
1400a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
14019566063dSJacob Faibussowitsch                               CUSPARSE_ACTION_NUMERIC, indexBase);PetscCallCUSPARSE(stat);
1402a49f1ed0SStefano Zampini                              #endif
14031a2c6b5cSJunchao Zhang       } else {
14041a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
14051a2c6b5cSJunchao Zhang       }
14061a2c6b5cSJunchao Zhang 
1407a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1408a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1409a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
14109566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(csr2cscBuffer));
1411a49f1ed0SStefano Zampini      #endif
1412a49f1ed0SStefano Zampini     }
1413a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1414a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1415a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1416a49f1ed0SStefano Zampini   }
14179566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
14189566063dSJacob Faibussowitsch   PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1419213423ffSJunchao Zhang   /* the compressed row indices are not used for matTranspose */
1420213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1421aa372e3fSPaul Mullowney   /* assign the pointer */
1422aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
14231a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1424bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1425bda325fcSPaul Mullowney }
1426bda325fcSPaul Mullowney 
1427a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve on the GPU using the transposed triangular
  factor structures and the row/column permutations stored in A->spptr.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side; accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - result vector; accessed through VecCUDAGetArrayWrite(); its length xx->map->n is used as n

  Notes:
  Sequence of operations: gather bb through rpermIndices into xx, cusparse_solve() with
  upTriFactorT (xarray, then the work vector), cusparse_solve() with loTriFactorT (the work
  vector, then xarray), gather xx through cpermIndices into the work vector, copy the work
  vector back into xx.

  cusparse_solve is defined outside this chunk; the two vector arguments are presumably
  (input, output) in that order - TODO confirm against its definition.

  PETSc and cuSPARSE calls are wrapped in PetscCall()/PetscCallCUSPARSE(); the thrust::copy()
  calls are not wrapped in a checking macro. The function ends with PetscFunctionReturn(0).
*/
14286fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1429bda325fcSPaul Mullowney {
1430c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1431465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1432465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1433465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1434465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1435bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1436bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1437aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1438aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  /* NOTE(review): workVector is read here, before the analysis call below; presumably the analysis does not replace it - confirm */
1439aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1440bda325fcSPaul Mullowney 
1441bda325fcSPaul Mullowney   PetscFunctionBegin;
1442aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* NOTE(review): the analysis runs only when BOTH transposed factors are missing, yet both pointers are dereferenced unconditionally by the solves below */
1443aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
14449566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    /* re-read the pointers after the analysis call */
1445aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1446aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1447bda325fcSPaul Mullowney   }
1448bda325fcSPaul Mullowney 
1449bda325fcSPaul Mullowney   /* Get the GPU pointers */
14509566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
14519566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
  /* wrap the raw device pointers so they can be used as thrust iterators */
1452c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1453c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1454bda325fcSPaul Mullowney 
14559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1456aa372e3fSPaul Mullowney   /* First, reorder with the row permutation: xGPU[i] = bGPU[rpermIndices[i]] */
  /* NOTE(review): the end iterator is built from bGPU+n whereas MatSolve_SeqAIJCUSPARSE uses bGPU; presumably only the index iterator bounds the range - confirm */
1457a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1458c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1459c41cb2e2SAlejandro Lamas Daviña                xGPU);
1460aa372e3fSPaul Mullowney 
1461aa372e3fSPaul Mullowney   /* Next, solve U: uses upTriFactorT with xarray and then the work vector as the vector arguments */
1462aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1463afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
14641b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1465afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1466afb2bd1cSJunchao Zhang                       #endif
1467afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1468aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1469aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1470aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1471aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1472d49cd2b7SBarry Smith                         xarray,
14731b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1474d49cd2b7SBarry Smith                         tempGPU->data().get(),
14759566063dSJacob Faibussowitsch                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1476d49cd2b7SBarry Smith                       #else
14779566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1478afb2bd1cSJunchao Zhang                       #endif
1479aa372e3fSPaul Mullowney 
1480aa372e3fSPaul Mullowney   /* Then, solve L: uses loTriFactorT with the work vector and then xarray as the vector arguments */
1481aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1482afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14831b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1484afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1485afb2bd1cSJunchao Zhang                       #endif
1486afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1487aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1488aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1489aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1490aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1491d49cd2b7SBarry Smith                         tempGPU->data().get(),
14921b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1493d49cd2b7SBarry Smith                         xarray,
14949566063dSJacob Faibussowitsch                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1495d49cd2b7SBarry Smith                       #else
14969566063dSJacob Faibussowitsch                          xarray);PetscCallCUSPARSE(stat);
1497afb2bd1cSJunchao Zhang                       #endif
1498aa372e3fSPaul Mullowney 
1499aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1500a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1501c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1502aa372e3fSPaul Mullowney                tempGPU->begin());
1503aa372e3fSPaul Mullowney 
1504aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
  /* the whole work vector is copied; presumably its length equals n - TODO confirm where workVector is allocated */
1505a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1506bda325fcSPaul Mullowney 
1507bda325fcSPaul Mullowney   /* restore */
15089566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
15099566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
15109566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
  /* flop count logged for the two solves: 2*nnz - A->cmap->n */
15119566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1512bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1513bda325fcSPaul Mullowney }
1514bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve on the GPU using the
  transposed triangular factor structures stored in A->spptr, without applying the
  rpermIndices/cpermIndices permutations used by MatSolveTranspose_SeqAIJCUSPARSE.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side; accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - result vector; accessed through VecCUDAGetArrayWrite()

  Notes:
  Sequence of operations: cusparse_solve() with upTriFactorT (barray, then the work vector),
  followed by cusparse_solve() with loTriFactorT (the work vector, then xarray).

  cusparse_solve is defined outside this chunk; the two vector arguments are presumably
  (input, output) in that order - TODO confirm against its definition.

  PETSc and cuSPARSE calls are wrapped in PetscCall()/PetscCallCUSPARSE(). The function ends
  with PetscFunctionReturn(0).
*/
15156fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1516bda325fcSPaul Mullowney {
1517465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1518465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1519bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1520bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1521aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1522aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  /* NOTE(review): workVector is read here, before the analysis call below; presumably the analysis does not replace it - confirm */
1523aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1524bda325fcSPaul Mullowney 
1525bda325fcSPaul Mullowney   PetscFunctionBegin;
1526aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* NOTE(review): the analysis runs only when BOTH transposed factors are missing, yet both pointers are dereferenced unconditionally by the solves below */
1527aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15289566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    /* re-read the pointers after the analysis call */
1529aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1530aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1531bda325fcSPaul Mullowney   }
1532bda325fcSPaul Mullowney 
1533bda325fcSPaul Mullowney   /* Get the GPU pointers */
15349566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
15359566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1536bda325fcSPaul Mullowney 
15379566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1538aa372e3fSPaul Mullowney   /* First, solve U: uses upTriFactorT with barray and then the work vector as the vector arguments */
1539aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1540afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
15411b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1542afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1543afb2bd1cSJunchao Zhang                       #endif
1544afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1545aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1546aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1547aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1548aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1549d49cd2b7SBarry Smith                         barray,
15501b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1551d49cd2b7SBarry Smith                         tempGPU->data().get(),
15529566063dSJacob Faibussowitsch                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1553d49cd2b7SBarry Smith                       #else
15549566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1555afb2bd1cSJunchao Zhang                       #endif
1556aa372e3fSPaul Mullowney 
1557aa372e3fSPaul Mullowney   /* Then, solve L: uses loTriFactorT with the work vector and then xarray as the vector arguments */
1558aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1559afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
15601b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1561afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1562afb2bd1cSJunchao Zhang                       #endif
1563afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1564aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1565aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1566aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1567aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1568d49cd2b7SBarry Smith                         tempGPU->data().get(),
15691b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1570d49cd2b7SBarry Smith                         xarray,
15719566063dSJacob Faibussowitsch                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);PetscCallCUSPARSE(stat);
1572d49cd2b7SBarry Smith                       #else
15739566063dSJacob Faibussowitsch                         xarray);PetscCallCUSPARSE(stat);
1574afb2bd1cSJunchao Zhang                       #endif
1575bda325fcSPaul Mullowney 
1576bda325fcSPaul Mullowney   /* restore */
15779566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
15789566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
15799566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
  /* flop count logged for the two solves: 2*nnz - A->cmap->n */
15809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1581bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1582bda325fcSPaul Mullowney }
1583bda325fcSPaul Mullowney 
15846fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
15859ae82921SPaul Mullowney {
1586465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1587465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1588465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1589465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15909ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15919ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1592aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1593aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1594aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
15959ae82921SPaul Mullowney 
15969ae82921SPaul Mullowney   PetscFunctionBegin;
1597ebc8f436SDominic Meiser 
1598e057df02SPaul Mullowney   /* Get the GPU pointers */
15999566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayWrite(xx,&xarray));
16009566063dSJacob Faibussowitsch   PetscCall(VecCUDAGetArrayRead(bb,&barray));
1601c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1602c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16039ae82921SPaul Mullowney 
16049566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1605aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1606a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1607c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
16084e4bbfaaSStefano Zampini                tempGPU->begin());
1609aa372e3fSPaul Mullowney 
1610aa372e3fSPaul Mullowney   /* Next, solve L */
1611aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1612afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16131b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1614afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1615afb2bd1cSJunchao Zhang                       #endif
1616afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1617aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1618aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1619aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1620aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1621d49cd2b7SBarry Smith                         tempGPU->data().get(),
16221b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1623d49cd2b7SBarry Smith                          xarray,
16249566063dSJacob Faibussowitsch                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1625d49cd2b7SBarry Smith                       #else
16269566063dSJacob Faibussowitsch                          xarray);PetscCallCUSPARSE(stat);
1627afb2bd1cSJunchao Zhang                       #endif
1628aa372e3fSPaul Mullowney 
1629aa372e3fSPaul Mullowney   /* Then, solve U */
1630aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1631afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16321b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1633afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1634afb2bd1cSJunchao Zhang                       #endif
1635afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1636aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1637aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1638aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1639d49cd2b7SBarry Smith                         upTriFactor->solveInfo,xarray,
16401b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1641d49cd2b7SBarry Smith                         tempGPU->data().get(),
16429566063dSJacob Faibussowitsch                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);PetscCallCUSPARSE(stat);
1643d49cd2b7SBarry Smith                       #else
16449566063dSJacob Faibussowitsch                         tempGPU->data().get());PetscCallCUSPARSE(stat);
1645afb2bd1cSJunchao Zhang                       #endif
1646d49cd2b7SBarry Smith 
16474e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1648a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
16494e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
16504e4bbfaaSStefano Zampini                xGPU);
16519ae82921SPaul Mullowney 
16529566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayRead(bb,&barray));
16539566063dSJacob Faibussowitsch   PetscCall(VecCUDARestoreArrayWrite(xx,&xarray));
16549566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
16559566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
16569ae82921SPaul Mullowney   PetscFunctionReturn(0);
16579ae82921SPaul Mullowney }
16589ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - triangular solves with the stored lower and upper factors,
  without any row/column permutation of the vectors.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The lower solve reads bb and writes the factor work vector; the upper solve reads the work
  vector and writes xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol,*scratch;
  cusparseStatus_t                  status;

  PetscFunctionBegin;
  /* Device pointers of the solution (write-only) and the right-hand side (read-only) */
  PetscCall(VecCUDAGetArrayWrite(xx,&sol));
  PetscCall(VecCUDAGetArrayRead(bb,&rhs));

  PetscCall(PetscLogGpuTimeBegin());
  /* Raw pointer of the intermediate vector shared by both solves */
  scratch = work->data().get();

  /* Lower solve: rhs -> scratch */
  status = cusparse_solve(factors->handle, lower->solveOp,
                          lower->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lower->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lower->descr,
                          lower->csrMat->values->data().get(),
                          lower->csrMat->row_offsets->data().get(),
                          lower->csrMat->column_indices->data().get(),
                          lower->solveInfo,
                          rhs,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          scratch,
                          lower->solvePolicy,lower->solveBuffer);
                        #else
                          scratch);
                        #endif
  PetscCallCUSPARSE(status);

  /* Upper solve: scratch -> sol */
  status = cusparse_solve(factors->handle, upper->solveOp,
                          upper->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upper->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upper->descr,
                          upper->csrMat->values->data().get(),
                          upper->csrMat->row_offsets->data().get(),
                          upper->csrMat->column_indices->data().get(),
                          upper->solveInfo,
                          scratch,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          sol,
                          upper->solvePolicy, upper->solveBuffer);
                        #else
                          sol);
                        #endif
  PetscCallCUSPARSE(status);

  PetscCall(VecCUDARestoreArrayRead(bb,&rhs));
  PetscCall(VecCUDARestoreArrayWrite(xx,&sol));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
17199ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - copies the matrix values from the device CSR storage into the host
  array of the SeqAIJ matrix.

  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_GPU; after the copy the mask becomes
  PETSC_OFFLOAD_BOTH. Only the values are transferred, not the row offsets or column indices.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij;
  Mat_SeqAIJCUSPARSE *gpu;
  CsrMatrix          *csr;
  size_t             nbytes;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  aij    = (Mat_SeqAIJ*)A->data;
  gpu    = (Mat_SeqAIJCUSPARSE*)A->spptr;
  csr    = (CsrMatrix*)gpu->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);

  PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
  PetscCallCUDA(cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuToCpu(nbytes));
  PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
17387e8381f9SStefano Zampini 
/*
  Hands out the host values array of the matrix, after first bringing it up to date
  from the GPU via MatSeqAIJCUSPARSECopyFromGPU().
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij      = (Mat_SeqAIJ*)A->data;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
174667a45760SJunchao Zhang 
/*
  Ends read/write access to the host values array: the offload mask is set to
  PETSC_OFFLOAD_CPU and the caller's pointer is cleared.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* The host array may have been modified, so the CPU copy is now the valid one */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  array[0]       = NULL;
  PetscFunctionReturn(0);
}
175467a45760SJunchao Zhang 
/*
  Hands out the host values array for read-only access, after first bringing it up to date
  from the GPU via MatSeqAIJCUSPARSECopyFromGPU().
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij      = (Mat_SeqAIJ*)A->data;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
176267a45760SJunchao Zhang 
/*
  Ends read-only access to the host values array. Only the caller's pointer is cleared;
  the offload mask of A is left untouched.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
176967a45760SJunchao Zhang 
/*
  Hands out the host values array for write-only access. No device-to-host copy is made,
  since the current contents are not synchronized before the pointer is returned.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
177667a45760SJunchao Zhang 
/*
  Ends write-only access to the host values array: the offload mask is set to
  PETSC_OFFLOAD_CPU and the caller's pointer is cleared.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  /* The host array was (re)written, so the CPU copy is now the valid one */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  array[0]       = NULL;
  PetscFunctionReturn(0);
}
17847e8381f9SStefano Zampini 
/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - returns pointers to the CSR arrays held in the GPU
  copy of the matrix, together with the memory type of those pointers.

  Input Parameter:
. A - the matrix; must not be a factored matrix

  Output Parameters (each may be NULL if not wanted):
+ i     - row offsets
. j     - column indices
. a     - values
- mtype - set to PETSC_MEMTYPE_CUDA

  Notes:
  The GPU copy is brought up to date with MatSeqAIJCUSPARSECopyToGPU() before the pointers are taken.

  Requesting i or j raises PETSC_ERR_SUP when PETSC_USE_64BIT_INDICES is defined.
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE           *cusp;
  CsrMatrix                    *matrix;

  PetscFunctionBegin;
  /* Validate first: MatSeqAIJCUSPARSECopyToGPU() casts A->spptr to Mat_SeqAIJCUSPARSE and dereferences it,
     so a factored matrix or a NULL spptr must be rejected before that call, not after it */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  PetscCheck(cusp != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  /* Read cusp->mat only now: the copy above may have rebuilt it */
  matrix = (CsrMatrix*)cusp->mat->mat;

  if (i) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (j) {
   #if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
   #else
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSparse does not supported 64-bit indices");
   #endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(0);
}
18157ee59b9bSJunchao Zhang 
/*
  MatSeqAIJCUSPARSEBuildDeviceCsr_Private - allocates a CsrMatrix and fills its device arrays from host CSR data.

  Input Parameters:
+ m   - number of stored rows (ii has m+1 entries)
. n   - number of columns
. nnz - number of stored entries (length of jj and, when given, of aa)
. ii  - host row offsets
. jj  - host column indices
- aa  - host values, or NULL to allocate the device values without copying anything into them

  Output Parameter:
. csr - the new matrix; the caller owns it and its three arrays
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildDeviceCsr_Private(PetscInt m,PetscInt n,PetscInt nnz,const PetscInt *ii,const PetscInt *jj,const PetscScalar *aa,CsrMatrix **csr)
{
  CsrMatrix *mat = new CsrMatrix;

  PetscFunctionBegin;
  mat->num_rows    = m;
  mat->num_cols    = n;
  mat->num_entries = nnz;

  mat->row_offsets = new THRUSTINTARRAY32(m+1);
  mat->row_offsets->assign(ii, ii + m+1);

  mat->column_indices = new THRUSTINTARRAY32(nnz);
  mat->column_indices->assign(jj, jj+nnz);

  mat->values = new THRUSTARRAY(nnz);
  if (aa) mat->values->assign(aa, aa+nnz);

  *csr = mat;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSECopyToGPU - brings the GPU copy of a SeqAIJCUSPARSE matrix up to date with the host data.

  Input Parameter:
. A - the matrix; must not be bound to the CPU

  Notes:
  Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  If the nonzero state recorded on the GPU side matches A->nonzerostate and the storage format is
  MAT_CUSPARSE_CSR, only the values are uploaded. Otherwise the existing GPU structure, the cached
  transpose, the work vector and the row offsets are destroyed and the GPU structure is rebuilt from
  the host CSR data (compressed-row data when a->compressedrow.use is set).

  A->offloadmask becomes PETSC_OFFLOAD_BOTH unless the host has no values array (a->a is NULL), in
  which case only the sparsity pattern is uploaded and the mask is left unchanged.

  MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are rejected with PETSC_ERR_SUP when built against CUDA >= 11.0;
  the check is made before anything on the GPU is destroyed or allocated.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheck(!a->nz || a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      matrix->values->assign(a->a, a->a+a->nz);
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar)));
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      /* Values changed but the pattern did not: keep the transpose structure, drop its values */
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
    } else {
      PetscInt nnz;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Fail before tearing down the current GPU data or allocating the new structure, so that
         the error path neither leaks nor leaves the matrix without its previous GPU copy */
      PetscCheck(cusparsestruct->format != MAT_CUSPARSE_ELL && cusparsestruct->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
     #endif
      PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format));
      PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheck(ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        /* Without host values only the pattern is uploaded, so host and device are not both valid afterwards */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;
        PetscCheck(!nnz || a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr));
        PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO));
        PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

        /* Device-resident scalar constants 1, 0, 1; the handle is switched to device pointer mode below */
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar)));
        PetscCallCUDA(cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar)));
        PetscCallCUDA(cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
        PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat;
          PetscCall(MatSeqAIJCUSPARSEBuildDeviceCsr_Private(m,A->cmap->n,nnz,ii,a->j,a->a,&mat));

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
          }
         #endif
        }
       #if !PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
          /* Upload a temporary CSR matrix and convert it on the device */
          CsrMatrix *mat;
          PetscCall(MatSeqAIJCUSPARSEBuildDeviceCsr_Private(m,A->cmap->n,nnz,ii,a->j,a->a,&mat));

          cusparseHybMat_t hybMat;
          PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);PetscCallCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* The temporary CSR matrix is no longer needed; mat and its arrays were all allocated above */
          delete (THRUSTARRAY*)mat->values;
          delete (THRUSTINTARRAY32*)mat->column_indices;
          delete (THRUSTINTARRAY32*)mat->row_offsets;
          delete (CsrMatrix*)mat;
        }
       #endif

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* Log what was actually uploaded: m+1 row offsets and nnz column indices, tmp compressed-row indices,
           the 3 scalar constants, and the nnz values only when the host has a values array */
        PetscCall(PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar)));

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      PetscCallCUDA(WaitForCUDA());
      PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
19709ae82921SPaul Mullowney 
/* Functor on a 2-tuple: adds element 0 into element 1 (element 1 = element 1 + element 0) */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
1980aa372e3fSPaul Mullowney 
/* Functor on a 2-tuple: copies element 0 into element 1 */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
19907e8381f9SStefano Zampini 
/* Functor on a 2-tuple: copies element 1 into element 0 (the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
2000e6e9a74fSStefano Zampini 
/*
  Data attached to a matrix product involving a SeqAIJCUSPARSE matrix.
  It is released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool             cisdense;
  PetscScalar           *Bt;       /* released with cudaFree() on destruction */
  Mat                   X;         /* released with MatDestroy() on destruction */
  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscBool             reusesym;
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;     /* released with delete on destruction */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  /* C = alpha op(A) op(B) + beta C */
  PetscBool             initialized;
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  /* Record leading dimensions of B and C here to detect changes */
  PetscInt              Blda;
  PetscInt              Clda;
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  /* SpGEMM WorkEstimation buffer */
  void                  *mmBuffer2;
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2025ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - releases everything held by a MatMatCusparse and then the structure itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = static_cast<MatMatCusparse*>(data);

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cuSPARSE descriptors: destroyed only if they were created */
  if (mm->matSpBDescr) {
    PetscCallCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));
  }
  if (mm->matBDescr) {
    PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));
  }
  if (mm->matCDescr) {
    PetscCallCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));
  }
  if (mm->spgemmDesc) {
    PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));
  }
  /* Device buffers: freed only if they were allocated */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4) {
    PetscCallCUDA(cudaFree(mm->dBuffer4));
  }
  if (mm->dBuffer5) {
    PetscCallCUDA(cudaFree(mm->dBuffer5));
  }
 #endif
  if (mm->mmBuffer) {
    PetscCallCUDA(cudaFree(mm->mmBuffer));
  }
  if (mm->mmBuffer2) {
    PetscCallCUDA(cudaFree(mm->mmBuffer2));
  }
 #endif
  PetscCall(MatDestroy(&mm->X));
  PetscCall(PetscFree(data));
  PetscFunctionReturn(0);
}
2049ccdfe979SStefano Zampini 
2050ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2051ccdfe979SStefano Zampini 
2052ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2053ccdfe979SStefano Zampini {
2054ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2055ccdfe979SStefano Zampini   Mat                          A,B;
2056afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2057ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2058ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2059ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2060ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2061ccdfe979SStefano Zampini   const PetscScalar            *barray;
2062ccdfe979SStefano Zampini   PetscScalar                  *carray;
2063ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2064ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2065ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2066ccdfe979SStefano Zampini 
2067ccdfe979SStefano Zampini   PetscFunctionBegin;
2068ccdfe979SStefano Zampini   MatCheckProduct(C,1);
206928b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2070ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2071ccdfe979SStefano Zampini   A    = product->A;
2072ccdfe979SStefano Zampini   B    = product->B;
20739566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
207428b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2075ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2076ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
207728b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
20789566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
2079ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2080ccdfe979SStefano Zampini   switch (product->type) {
2081ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2082ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2083ccdfe979SStefano Zampini     mat = cusp->mat;
2084ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2085ccdfe979SStefano Zampini     m   = A->rmap->n;
2086ccdfe979SStefano Zampini     n   = B->cmap->n;
2087ccdfe979SStefano Zampini     break;
2088ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20891a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2090e6e9a74fSStefano Zampini       mat = cusp->mat;
2091e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2092e6e9a74fSStefano Zampini     } else {
20939566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2094ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2095ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2096e6e9a74fSStefano Zampini     }
2097ccdfe979SStefano Zampini     m = A->cmap->n;
2098ccdfe979SStefano Zampini     n = B->cmap->n;
2099ccdfe979SStefano Zampini     break;
2100ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2101ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2102ccdfe979SStefano Zampini     mat = cusp->mat;
2103ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2104ccdfe979SStefano Zampini     m   = A->rmap->n;
2105ccdfe979SStefano Zampini     n   = B->rmap->n;
2106ccdfe979SStefano Zampini     break;
2107ccdfe979SStefano Zampini   default:
210898921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2109ccdfe979SStefano Zampini   }
211028b400f6SJacob Faibussowitsch   PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2111ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2112ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
21139566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
21149566063dSJacob Faibussowitsch   if (!biscuda) PetscCall(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
21159566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDAGetArrayRead(B,&barray));
2116afb2bd1cSJunchao Zhang 
21179566063dSJacob Faibussowitsch   PetscCall(MatDenseGetLDA(B,&blda));
2118c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
21199566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
21209566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(mmdata->X,&clda));
2121c8378d12SStefano Zampini   } else {
21229566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDAGetArrayWrite(C,&carray));
21239566063dSJacob Faibussowitsch     PetscCall(MatDenseGetLDA(C,&clda));
2124c8378d12SStefano Zampini   }
2125c8378d12SStefano Zampini 
21269566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2127afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2128afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2129a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2130afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2131fcdce8c4SStefano Zampini     size_t mmBufferSize;
21329566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Blda != blda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
2133afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
21349566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2135afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2136afb2bd1cSJunchao Zhang     }
2137c8378d12SStefano Zampini 
21389566063dSJacob Faibussowitsch     if (mmdata->initialized && mmdata->Clda != clda) {PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
2139afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
21409566063dSJacob Faibussowitsch       PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
2141afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2142afb2bd1cSJunchao Zhang     }
2143afb2bd1cSJunchao Zhang 
2144afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2145afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2146afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2147afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2148afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2149afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
21509566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);PetscCallCUSPARSE(stat);
2151afb2bd1cSJunchao Zhang     }
2152afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2153afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2154afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
21559566063dSJacob Faibussowitsch                                    cusp->spmmAlg,&mmBufferSize);PetscCallCUSPARSE(stat);
2156fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
21579566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(mmdata->mmBuffer));
21589566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
2159fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2160fcdce8c4SStefano Zampini     }
2161afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2162afb2bd1cSJunchao Zhang   } else {
2163afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
21649566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
21659566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
21669566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
2167afb2bd1cSJunchao Zhang   }
2168afb2bd1cSJunchao Zhang 
2169afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2170afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2171afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2172afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
21739566063dSJacob Faibussowitsch                       cusp->spmmAlg,mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2174afb2bd1cSJunchao Zhang  #else
2175afb2bd1cSJunchao Zhang   PetscInt k;
2176afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2177ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2178ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2179ccdfe979SStefano Zampini     cublasStatus_t cerr;
2180ccdfe979SStefano Zampini 
21819566063dSJacob Faibussowitsch     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
2182ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2183ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2184ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2185ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
21869566063dSJacob Faibussowitsch                        mmdata->Bt,B->cmap->n);PetscCallCUBLAS(cerr);
2187ccdfe979SStefano Zampini     blda = B->cmap->n;
2188afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2189afb2bd1cSJunchao Zhang   } else {
2190afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2191ccdfe979SStefano Zampini   }
2192ccdfe979SStefano Zampini 
2193afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2194ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2195afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2196ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2197ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2198ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2199ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
22009566063dSJacob Faibussowitsch                            carray,clda);PetscCallCUSPARSE(stat);
2201afb2bd1cSJunchao Zhang  #endif
22029566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
22039566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
22049566063dSJacob Faibussowitsch   PetscCall(MatDenseCUDARestoreArrayRead(B,&barray));
2205ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
22069566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
22079566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
2208ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
22099566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
22109566063dSJacob Faibussowitsch     PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
2211ccdfe979SStefano Zampini   } else {
22129566063dSJacob Faibussowitsch     PetscCall(MatDenseCUDARestoreArrayWrite(C,&carray));
2213ccdfe979SStefano Zampini   }
2214ccdfe979SStefano Zampini   if (mmdata->cisdense) {
22159566063dSJacob Faibussowitsch     PetscCall(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
2216ccdfe979SStefano Zampini   }
2217ccdfe979SStefano Zampini   if (!biscuda) {
22189566063dSJacob Faibussowitsch     PetscCall(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
2219ccdfe979SStefano Zampini   }
2220ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2221ccdfe979SStefano Zampini }
2222ccdfe979SStefano Zampini 
2223ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2224ccdfe979SStefano Zampini {
2225ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2226ccdfe979SStefano Zampini   Mat                A,B;
2227ccdfe979SStefano Zampini   PetscInt           m,n;
2228ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2229ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2230ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2231ccdfe979SStefano Zampini 
2232ccdfe979SStefano Zampini   PetscFunctionBegin;
2233ccdfe979SStefano Zampini   MatCheckProduct(C,1);
223428b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2235ccdfe979SStefano Zampini   A    = product->A;
2236ccdfe979SStefano Zampini   B    = product->B;
22379566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
223828b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2239ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
224008401ef6SPierre Jolivet   PetscCheck(cusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2241ccdfe979SStefano Zampini   switch (product->type) {
2242ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2243ccdfe979SStefano Zampini     m = A->rmap->n;
2244ccdfe979SStefano Zampini     n = B->cmap->n;
2245ccdfe979SStefano Zampini     break;
2246ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2247ccdfe979SStefano Zampini     m = A->cmap->n;
2248ccdfe979SStefano Zampini     n = B->cmap->n;
2249ccdfe979SStefano Zampini     break;
2250ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2251ccdfe979SStefano Zampini     m = A->rmap->n;
2252ccdfe979SStefano Zampini     n = B->rmap->n;
2253ccdfe979SStefano Zampini     break;
2254ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2255ccdfe979SStefano Zampini     m = B->cmap->n;
2256ccdfe979SStefano Zampini     n = B->cmap->n;
2257ccdfe979SStefano Zampini     break;
2258ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2259ccdfe979SStefano Zampini     m = B->rmap->n;
2260ccdfe979SStefano Zampini     n = B->rmap->n;
2261ccdfe979SStefano Zampini     break;
2262ccdfe979SStefano Zampini   default:
226398921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2264ccdfe979SStefano Zampini   }
22659566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
2266ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
22679566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
22689566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQDENSECUDA));
2269ccdfe979SStefano Zampini 
2270ccdfe979SStefano Zampini   /* product data */
22719566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2272ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2273afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2274afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2275ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
22769566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
2277ccdfe979SStefano Zampini   }
2278afb2bd1cSJunchao Zhang  #endif
2279ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2280ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
22819566063dSJacob Faibussowitsch     PetscCall(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
22829566063dSJacob Faibussowitsch     PetscCall(MatSetType(mmdata->X,MATSEQDENSECUDA));
2283ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
22849566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
2285ccdfe979SStefano Zampini     } else {
22869566063dSJacob Faibussowitsch       PetscCall(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
2287ccdfe979SStefano Zampini     }
2288ccdfe979SStefano Zampini   }
2289ccdfe979SStefano Zampini   C->product->data    = mmdata;
2290ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2291ccdfe979SStefano Zampini 
2292ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2293ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2294ccdfe979SStefano Zampini }
2295ccdfe979SStefano Zampini 
2296fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2297ccdfe979SStefano Zampini {
2298ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2299fcdce8c4SStefano Zampini   Mat                          A,B;
2300fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2301fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2302fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2303fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2304fcdce8c4SStefano Zampini   PetscBool                    flg;
2305fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2306fcdce8c4SStefano Zampini   MatProductType               ptype;
2307fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2308fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2309fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2310fcdce8c4SStefano Zampini #endif
2311b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2312ccdfe979SStefano Zampini 
2313ccdfe979SStefano Zampini   PetscFunctionBegin;
2314ccdfe979SStefano Zampini   MatCheckProduct(C,1);
231528b400f6SJacob Faibussowitsch   PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
23169566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
231728b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2318fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2319fcdce8c4SStefano Zampini   A = product->A;
2320fcdce8c4SStefano Zampini   B = product->B;
2321fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2322fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2323fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
232408401ef6SPierre Jolivet     PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2325fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
232628b400f6SJacob Faibussowitsch     PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2327fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
232828b400f6SJacob Faibussowitsch     PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2329fcdce8c4SStefano Zampini     goto finalize;
2330fcdce8c4SStefano Zampini   }
2331fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
23329566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
233328b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
23349566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
233528b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
233628b400f6SJacob Faibussowitsch   PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
233728b400f6SJacob Faibussowitsch   PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2338fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2339fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2340fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
234108401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
234208401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
234308401ef6SPierre Jolivet   PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
23449566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
23459566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2346fcdce8c4SStefano Zampini 
2347fcdce8c4SStefano Zampini   ptype = product->type;
2348fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2349fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
235028b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2351fa046f9fSJunchao Zhang   }
2352fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2353fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
235428b400f6SJacob Faibussowitsch     PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2355fa046f9fSJunchao Zhang   }
2356fcdce8c4SStefano Zampini   switch (ptype) {
2357fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2358fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2359fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2360fcdce8c4SStefano Zampini     break;
2361fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2362fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2363fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2364fcdce8c4SStefano Zampini     break;
2365fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2366fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2367fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2368fcdce8c4SStefano Zampini     break;
2369fcdce8c4SStefano Zampini   default:
237098921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2371fcdce8c4SStefano Zampini   }
2372fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
237328b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
237428b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
237528b400f6SJacob Faibussowitsch   PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2376fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2377fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2378fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
237928b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
238028b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
238128b400f6SJacob Faibussowitsch   PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
23829566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2383fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2384fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
23859566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2386b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2387b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2388b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2389b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
23909566063dSJacob Faibussowitsch                                mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2391b4285af6SJunchao Zhang   #else
2392b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2393fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2394fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
23959566063dSJacob Faibussowitsch                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2396b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2397fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
23989566063dSJacob Faibussowitsch                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2399b4285af6SJunchao Zhang   #endif
2400fcdce8c4SStefano Zampini #else
2401b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2402fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2403fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2404fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
24059566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2406fcdce8c4SStefano Zampini #endif
24079566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
24089566063dSJacob Faibussowitsch   PetscCallCUDA(WaitForCUDA());
24099566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2410fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2411fcdce8c4SStefano Zampini finalize:
2412fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
24139566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
24149566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
24159566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
2416fcdce8c4SStefano Zampini   c->reallocs         = 0;
2417fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2418fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2419fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2420fcdce8c4SStefano Zampini   C->num_ass++;
2421ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2422ccdfe979SStefano Zampini }
2423fcdce8c4SStefano Zampini 
2424fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2425fcdce8c4SStefano Zampini {
2426fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2427fcdce8c4SStefano Zampini   Mat                          A,B;
2428fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2429fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2430fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2431fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2432fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2433fcdce8c4SStefano Zampini   PetscBool                    flg;
2434fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2435fcdce8c4SStefano Zampini   MatProductType               ptype;
2436fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2437fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2438fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2439fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2440fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2441fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2442fcdce8c4SStefano Zampini #else
2443fcdce8c4SStefano Zampini   int                          cnz;
2444fcdce8c4SStefano Zampini #endif
2445b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2446fcdce8c4SStefano Zampini 
2447fcdce8c4SStefano Zampini   PetscFunctionBegin;
2448fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
244928b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2450fcdce8c4SStefano Zampini   A    = product->A;
2451fcdce8c4SStefano Zampini   B    = product->B;
24529566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
245328b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
24549566063dSJacob Faibussowitsch   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
245528b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2456fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2457fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2458fcdce8c4SStefano Zampini   /* product data */
24599566063dSJacob Faibussowitsch   PetscCall(PetscNew(&mmdata));
2460fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2461fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2462fcdce8c4SStefano Zampini 
24639566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
24649566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
2465d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2466d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
246708401ef6SPierre Jolivet   PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
246808401ef6SPierre Jolivet   PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2469d60bce21SJunchao Zhang 
2470fcdce8c4SStefano Zampini   ptype = product->type;
2471fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2472fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2473fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2474fa046f9fSJunchao Zhang   }
2475fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2476fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2477fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2478fa046f9fSJunchao Zhang   }
2479fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2480fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2481fcdce8c4SStefano Zampini   switch (ptype) {
2482fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2483fcdce8c4SStefano Zampini     m = A->rmap->n;
2484fcdce8c4SStefano Zampini     n = B->cmap->n;
2485fcdce8c4SStefano Zampini     k = A->cmap->n;
2486fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2487fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2488fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2489fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2490fcdce8c4SStefano Zampini     break;
2491fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2492fcdce8c4SStefano Zampini     m = A->cmap->n;
2493fcdce8c4SStefano Zampini     n = B->cmap->n;
2494fcdce8c4SStefano Zampini     k = A->rmap->n;
24959566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2496fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2497fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2498fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2499fcdce8c4SStefano Zampini     break;
2500fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2501fcdce8c4SStefano Zampini     m = A->rmap->n;
2502fcdce8c4SStefano Zampini     n = B->rmap->n;
2503fcdce8c4SStefano Zampini     k = A->cmap->n;
25049566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2505fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2506fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2507fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2508fcdce8c4SStefano Zampini     break;
2509fcdce8c4SStefano Zampini   default:
251098921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2511fcdce8c4SStefano Zampini   }
2512fcdce8c4SStefano Zampini 
2513fcdce8c4SStefano Zampini   /* create cusparse matrix */
25149566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(C,m,n,m,n));
25159566063dSJacob Faibussowitsch   PetscCall(MatSetType(C,MATSEQAIJCUSPARSE));
2516fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2517fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2518fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2519fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2520fcdce8c4SStefano Zampini 
2521fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2522fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2523fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
25249566063dSJacob Faibussowitsch     PetscCall(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
25259566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
2526fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2527fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2528fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2529fcdce8c4SStefano Zampini   } else {
2530fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2531fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2532fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2533fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2534fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2535fcdce8c4SStefano Zampini   }
2536fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2537fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2538fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2539fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2540fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2541fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
25429566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
25439566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
25449566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
25459566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
25469566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
25479566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
25489566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
25499566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
25509566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2551fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2552fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2553fcdce8c4SStefano Zampini     c->nz = 0;
2554fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2555fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2556fcdce8c4SStefano Zampini     goto finalizesym;
2557fcdce8c4SStefano Zampini   }
2558fcdce8c4SStefano Zampini 
255928b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
256028b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2561fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2562fcdce8c4SStefano Zampini   if (!biscompressed) {
2563fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2564fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2565fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2566fcdce8c4SStefano Zampini #endif
2567fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2568fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2569fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2570fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2571fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2572fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2573fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2574fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2575fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2576fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2577fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
25789566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
2579fcdce8c4SStefano Zampini     }
2580fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2581fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2582fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2583fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2584fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2585fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2586fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2587fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
25889566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
2589fcdce8c4SStefano Zampini     }
2590fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2591fcdce8c4SStefano Zampini #endif
2592fcdce8c4SStefano Zampini   }
259328b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
259428b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2595fcdce8c4SStefano Zampini   /* precompute flops count */
2596fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2597fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2598fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2599fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2600fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2601fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2602fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2603fcdce8c4SStefano Zampini       }
2604fcdce8c4SStefano Zampini     }
2605fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2606fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2607fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2608fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2609fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2610fcdce8c4SStefano Zampini     }
2611fcdce8c4SStefano Zampini   } else { /* TODO */
2612fcdce8c4SStefano Zampini     flops = 0.;
2613fcdce8c4SStefano Zampini   }
2614fcdce8c4SStefano Zampini 
2615fcdce8c4SStefano Zampini   mmdata->flops = flops;
26169566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
2617b4285af6SJunchao Zhang 
2618fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
26199566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2620fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2621fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2622fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
26239566063dSJacob Faibussowitsch                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
26249566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2625b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2626b4285af6SJunchao Zhang  {
2627b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2628b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2629b4285af6SJunchao Zhang   */
2630b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2631b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2632b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2633b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2634b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2635b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2636b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2637b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2638b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2639b4285af6SJunchao Zhang 
2640b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2641b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2642b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2643b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26449566063dSJacob Faibussowitsch                                             &bufferSize1, NULL);PetscCallCUSPARSE(stat);
26459566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
2646b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2647b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2648b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26499566063dSJacob Faibussowitsch                                             &bufferSize1, dBuffer1);PetscCallCUSPARSE(stat);
2650b4285af6SJunchao Zhang 
2651b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2652b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2653b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26549566063dSJacob Faibussowitsch                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);PetscCallCUSPARSE(stat);
26559566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
26569566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
26579566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
2658b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2659b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26609566063dSJacob Faibussowitsch                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);PetscCallCUSPARSE(stat);
26619566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer1));
26629566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer2));
2663b4285af6SJunchao Zhang 
2664b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2665b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
26669566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2667b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2668b4285af6SJunchao Zhang   /* allocate matrix C */
26699566063dSJacob Faibussowitsch   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
26709566063dSJacob Faibussowitsch   Ccsr->values         = new THRUSTARRAY(c->nz);PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2671b4285af6SJunchao Zhang   /* update matC with the new pointers */
2672b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
26739566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2674b4285af6SJunchao Zhang 
2675b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2676b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2677b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26789566063dSJacob Faibussowitsch                                   &bufferSize5, NULL);PetscCallCUSPARSE(stat);
26799566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
2680b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2681b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
26829566063dSJacob Faibussowitsch                                   &bufferSize5, mmdata->dBuffer5);PetscCallCUSPARSE(stat);
26839566063dSJacob Faibussowitsch   PetscCallCUDA(cudaFree(dBuffer3));
2684b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2685b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2686b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
26879566063dSJacob Faibussowitsch                                      mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
26889566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
2689b4285af6SJunchao Zhang  }
2690ae37ee31SJunchao Zhang  #else
2691b4285af6SJunchao Zhang   size_t bufSize2;
2692fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2693b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2694fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2695fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
26969566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, NULL);PetscCallCUSPARSE(stat);
26979566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
2698fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2699b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2700fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2701fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27029566063dSJacob Faibussowitsch                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);PetscCallCUSPARSE(stat);
2703fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2704b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2705fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2706fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27079566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);PetscCallCUSPARSE(stat);
2708fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2709fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2710fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2711fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2712fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
27139566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
2714fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2715b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2716fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2717fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
27189566063dSJacob Faibussowitsch                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);PetscCallCUSPARSE(stat);
2719fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
27209566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2721fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
27229566063dSJacob Faibussowitsch   PetscCall(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
2723fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
27249566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2725fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
27269566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2727fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
27289566063dSJacob Faibussowitsch                                 Ccsr->values->data().get());PetscCallCUSPARSE(stat);
2729b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2730fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
27319566063dSJacob Faibussowitsch                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);PetscCallCUSPARSE(stat);
2732ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2733fcdce8c4SStefano Zampini #else
27349566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2735b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2736fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2737fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2738fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
27399566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);PetscCallCUSPARSE(stat);
2740fcdce8c4SStefano Zampini   c->nz = cnz;
2741fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
27429566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2743fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
27449566063dSJacob Faibussowitsch   PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2745fcdce8c4SStefano Zampini 
27469566063dSJacob Faibussowitsch   PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2747fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2748fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2749fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2750b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2751fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2752fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2753fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
27549566063dSJacob Faibussowitsch                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());PetscCallCUSPARSE(stat);
2755fcdce8c4SStefano Zampini #endif
27569566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuFlops(mmdata->flops));
27579566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
2758fcdce8c4SStefano Zampini finalizesym:
2759fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2760fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2761fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
27629566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m+1,&c->i));
27639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->j));
2764fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2765fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2766fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2767fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2768fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2769fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2770fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27729566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2773fcdce8c4SStefano Zampini   } else {
2774fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2775fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27769566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27779566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2778fcdce8c4SStefano Zampini   }
2779fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2780fcdce8c4SStefano Zampini     PetscInt r = 0;
2781fcdce8c4SStefano Zampini     c->i[0] = 0;
2782fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2783fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2784fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2785fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2786fcdce8c4SStefano Zampini     }
2787fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2788fcdce8c4SStefano Zampini   }
27899566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
27909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->ilen));
27919566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(m,&c->imax));
2792fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2793fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2794fcdce8c4SStefano Zampini   c->rmax = 0;
2795fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2796fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2797fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2798fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2799fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2800fcdce8c4SStefano Zampini   }
28019566063dSJacob Faibussowitsch   PetscCall(MatMarkDiagonal_SeqAIJ(C));
28029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(c->nz,&c->a));
2803fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2804fcdce8c4SStefano Zampini 
2805fcdce8c4SStefano Zampini   C->nonzerostate++;
28069566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->rmap));
28079566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(C->cmap));
2808fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2809fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2810fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2811fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2812fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2813abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2814fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2815fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2816fcdce8c4SStefano Zampini   }
2817fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2818fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2819fcdce8c4SStefano Zampini }
2820fcdce8c4SStefano Zampini 
/* Forward declaration only (declared PETSC_INTERN, taking a single Mat); no definition is visible in this section.
   NOTE(review): judging by its name this is the product-setup routine for a SeqAIJ matrix times a SeqDense matrix -- confirm against its definition. */
2821fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2822fcdce8c4SStefano Zampini 
/*
  Selects the symbolic product routine for a product whose A is SEQAIJCUSPARSE.
  Handles sparse or dense B.

  Steps:
    1. Detect whether B is (derived from) SEQDENSE and whether B (and C, for ABC products)
       are SEQAIJCUSPARSE matrices not bound to the CPU.
    2. If the GPU sparse-sparse path is available, let the user force the CPU backend through
       a product-type specific command line option.
    3. Install mat->ops->productsymbolic, or defer to the SeqAIJ implementation.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product *product = mat->product;
  PetscBool   isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
  }
  if (product->type == MATPRODUCT_ABC) {
    /* C only matters for triple products; it must be a GPU matrix too */
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
    }
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    const PetscBool api     = product->api_user;
    const char      *title  = NULL,*option = NULL,*manpage = NULL;
    PetscBool       usecpu  = PETSC_FALSE;

    /* pick the options-section title, option name and manual page for this product type */
    switch (product->type) {
    case MATPRODUCT_AB:
      title   = api ? "MatMatMult" : "MatProduct_AB";
      option  = api ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatMatMult";
      break;
    case MATPRODUCT_AtB:
      title   = api ? "MatTransposeMatMult" : "MatProduct_AtB";
      option  = api ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatTransposeMatMult";
      break;
    case MATPRODUCT_PtAP:
      title   = api ? "MatPtAP" : "MatProduct_PtAP";
      option  = api ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatPtAP";
      break;
    case MATPRODUCT_RARt:
      title   = api ? "MatRARt" : "MatProduct_RARt";
      option  = api ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatRARt";
      break;
    case MATPRODUCT_ABC:
      title   = api ? "MatMatMatMult" : "MatProduct_ABC";
      option  = api ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatMatMatMult";
      break;
    default: /* no backend option is offered for the remaining product types */
      break;
    }
    if (option) {
      PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");
      PetscCall(PetscOptionsBool(option,"Use CPU code",manpage,usecpu,&usecpu,NULL));
      PetscOptionsEnd();
    }
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch: dense B */
  if (isdense) {
    const MatProductType ptype = product->type;

    if (ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
    PetscFunctionReturn(0);
  }

  /* dispatch: fallback for AIJ when the GPU sparse-sparse path is unavailable or was disabled */
  if (!(Biscusp && Ciscusp)) {
    PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
    PetscFunctionReturn(0);
  }

  /* dispatch: sparse B (and C) on the GPU */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
    mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    break;
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
  case MATPRODUCT_ABC:
    mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
2944ccdfe979SStefano Zampini 
/* yy = A*xx: calls the shared multiply-add kernel with no vector to add and no transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm));
  PetscFunctionReturn(0);
}
2951e6e9a74fSStefano Zampini 
/* zz = A*xx + yy: calls the shared multiply-add kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm));
  PetscFunctionReturn(0);
}
2958e6e9a74fSStefano Zampini 
/* yy = A^H*xx: calls the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm));
  PetscFunctionReturn(0);
}
2965e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy: calls the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm));
  PetscFunctionReturn(0);
}
29729ae82921SPaul Mullowney 
/* yy = A^T*xx: calls the shared multiply-add kernel with the transpose flag only and no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm));
  PetscFunctionReturn(0);
}
2979ca45077fSPaul Mullowney 
/*
  ScatterAdd - device kernel computing y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Launch layout: 1D grid of 1D blocks with at least n threads in total; surplus threads do nothing.

  The global index is formed in PetscInt arithmetic so that it cannot wrap when PetscInt is 64-bit
  and n exceeds the range of a 32-bit int.

  NOTE(review): the update is a plain (non-atomic) read-modify-write, so the result is only
  well defined if the entries of idx[] are distinct -- confirm this holds for every caller.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
2985a0e72f99SJunchao Zhang 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - computes z = op(A) x + y on the GPU.

  If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

  Input Parameters:
+  A     - the matrix
.  xx    - the vector to be multiplied
.  yy    - the vector to be added, or NULL to compute z = op(A) x
.  trans - apply the transpose of A
-  herm  - apply the conjugate transpose of A (requires trans)

  Output Parameter:
.  zz    - the result; may be the same vector as yy

  Notes:
  When trans is set, herm is not set and A->form_explicit_transpose is true, the explicitly stored
  transpose is multiplied with the non-transposed cuSPARSE operation; otherwise cuSPARSE is asked to
  apply the (conjugate) transpose of the stored matrix.

  When the selected matrix structure uses compressed rows (zero rows dropped), a work vector holds
  the short side of the product: the result for the non-transposed operation (scattered into zz
  afterwards) or the gathered input for the transposed operation.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  PetscCheck(!herm || trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nz) { /* empty matrix: the product vanishes, so z = 0 or z = y */
    if (!yy) PetscCall(VecSet_SeqCUDA(zz,0));
    else PetscCall(VecCopy_SeqCUDA(yy,zz));
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* matstruct is dereferenced right below, so validate it whichever branch selected it */
  PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    PetscCall(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
    else PetscCall(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */

    PetscCall(PetscLogGpuTimeBegin());
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^T x is of full length and is written directly into zarray. zarray holds y only when yy == zz
         (it was then obtained with VecCUDAGetArray()), so only in that case may we use beta = 1 to add y
         in the same call. When yy != zz, zarray was obtained for writing only and its contents must not be
         accumulated into the result: use beta = 0 and add yy afterwards.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into matstruct->cuSpMV[], so its numeric range matters */
      PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
        PetscCallCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                               matstruct->matDescr,
                                               matstruct->cuSpMV[opA].vecXDescr, beta,
                                               matstruct->cuSpMV[opA].vecYDescr,
                                               cusparse_scalartype,
                                               cusparsestruct->spmvAlg,
                                               &matstruct->cuSpMV[opA].spmvBufferSize));
        PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
        PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
      }

      PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
                                  matstruct->alpha_one,
                                  matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                                  matstruct->cuSpMV[opA].vecXDescr,
                                  beta,
                                  matstruct->cuSpMV[opA].vecYDescr,
                                  cusparse_scalartype,
                                  cusparsestruct->spmvAlg,
                                  matstruct->cuSpMV[opA].spmvBuffer));
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
                                       mat->num_rows, mat->num_cols,
                                       mat->num_entries, matstruct->alpha_one, matstruct->descr,
                                       mat->values->data().get(), mat->row_offsets->data().get(),
                                       mat->column_indices->data().get(), xptr, beta,
                                       dptr));
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                         matstruct->alpha_one, matstruct->descr, hybMat,
                                         xptr, beta,
                                         dptr));
       #endif
      }
    }
    PetscCall(PetscLogGpuTimeEnd());

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          PetscCall(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        PetscCall(VecSet_SeqCUDA(zz,0));
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        PetscCall(PetscLogGpuTimeBegin());
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
        PetscCallCUDA(cudaPeekAtLastError()); /* a kernel launch returns no status; surface launch failures here */
       #endif
        PetscCall(PetscLogGpuTimeEnd());
      }
    } else {
      if (yy && yy != zz) { /* zz holds op(A)*xx only (beta was zero), so yy is added exactly once here */
        PetscCall(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
      }
    }
    PetscCall(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
    if (yy == zz) PetscCall(VecCUDARestoreArray(zz,&zarray));
    else PetscCall(VecCUDARestoreArrayWrite(zz,&zarray));
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    PetscCall(PetscLogGpuFlops(2.0*a->nz));
  } else {
    PetscCall(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
  }
  PetscFunctionReturn(0);
}
31749ae82921SPaul Mullowney 
/* zz = A^T*xx + yy: calls the shared multiply-add kernel with the transpose flag only */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm));
  PetscFunctionReturn(0);
}
3181ca45077fSPaul Mullowney 
/*
  Finishes assembly through the SeqAIJ implementation and, if that changed the nonzero state
  of A, frees the cached device-side matrix so that it is not used with the old state.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState state_before = A->nonzerostate; /* sampled before the host assembly runs */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_SeqAIJ(A,mode));
  /* nothing to invalidate if the nonzero state is unchanged or no device matrix exists */
  if (state_before == A->nonzerostate || !cusp->deviceMat) PetscFunctionReturn(0);

  PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
  PetscCallCUDA(cudaFree(cusp->deviceMat));
  cusp->deviceMat = NULL;
  PetscFunctionReturn(0);
}
31979ae82921SPaul Mullowney 
31989ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscFunctionBegin;
  /* create an empty matrix; m and n are passed as both the local and the global sizes */
  PetscCall(MatCreate(comm,A));
  PetscCall(MatSetSizes(*A,m,n,m,n));
  /* select the CUSPARSE implementation, then preallocate storage from nz/nnz */
  PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
  PetscFunctionReturn(0);
}
32559ae82921SPaul Mullowney 
/*
  Destroys the GPU-side data attached to A->spptr (the multiply structures for an unfactored
  matrix, the triangular-factor structures otherwise), removes the functions composed on the
  object under the names listed below, and finally destroys the SeqAIJ part of the matrix.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions to remove, in removal order */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  const int ncomposed = (int)(sizeof(composed)/sizeof(composed[0]));
  int       k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  } else {
    PetscCall(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  }
  for (k = 0; k < ncomposed; k++) {
    PetscCall(PetscObjectComposeFunction((PetscObject)A,composed[k],NULL));
  }
  PetscCall(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
32779ae82921SPaul Mullowney 
/* defined further down in this file */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicate callback for the SeqAIJCUSPARSE type.

  The duplicate is first produced by MatDuplicate_SeqAIJ() and then converted
  in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscFunctionBegin;
  /* step 1: SeqAIJ-level duplicate */
  PetscCall(MatDuplicate_SeqAIJ(A,cpvalues,B));
  /* step 2: in-place conversion of the duplicate to the CUSPARSE type */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
32879ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - AXPY callback (Y <- a*X + Y) for the SeqAIJCUSPARSE type.

  Dispatch, in order:
    - X and Y do not share the same axpy callback: defer to MatAXPY_SeqAIJ().
    - SAME_NONZERO_PATTERN (given, or detected by comparing the device CSR index
      arrays): a single cuBLAS axpy over the value arrays.
    - SUBSET_NONZERO_PATTERN: cuSPARSE spgeam, with Y's arrays also used as output.
    - anything else: defer to MatAXPY_SeqAIJ().

  Only the MAT_CUSPARSE_CSR storage format is supported once past the first check.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ*)X->data;
  Mat_SeqAIJ         *yaij  = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cuspy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cuspx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  CsrMatrix          *csry,*csrx;
  PetscScalar        *yvals;
  const PetscScalar  *xvals;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cuspy->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cuspx->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cuspy->mat->mat;
  csrx = (CsrMatrix*)cuspx->mat->mat;

  /* see if we can turn this into a cublas axpy: equal nz counts and identical index arrays */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    /* the column indices are only compared when the row offsets already match */
    const bool samepattern = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin())
                          && thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
    if (samepattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  /* neither of the two GPU paths applies: use the SeqAIJ implementation */
  if (str != SUBSET_NONZERO_PATTERN && str != SAME_NONZERO_PATTERN) {
    PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    PetscCall(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }

  /* both GPU paths work on the device value arrays */
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X,&xvals));
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&yvals));
  if (str == SUBSET_NONZERO_PATTERN) {
    PetscScalar unit = 1.0; /* coefficient applied to Y in a*X + unit*Y */
    auto        xrow = csrx->row_offsets->data().get();
    auto        xcol = csrx->column_indices->data().get();
    auto        yrow = csry->row_offsets->data().get();
    auto        ycol = csry->column_indices->data().get();
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    /* the scalars a and unit are passed by host address */
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspy->handle, CUSPARSE_POINTER_MODE_HOST));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cuspy->handle,Y->rmap->n,Y->cmap->n,
                                                  &a,cuspx->mat->descr,xaij->nz,xvals,xrow,xcol,
                                                  &unit,cuspy->mat->descr,yaij->nz,yvals,yrow,ycol,
                                                  cuspy->mat->descr,      yvals,yrow,ycol,&bufferSize));
    PetscCallCUDA(cudaMalloc(&buffer,bufferSize));
#endif
    PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cuspx->mat->descr,xaij->nz,xvals,xrow,xcol,
                                       &unit,cuspy->mat->descr,yaij->nz,yvals,yrow,ycol,
                                       cuspy->mat->descr,      yvals,yrow,ycol,buffer));
#else
    PetscCallCUSPARSE(cusparse_csr_spgeam(cuspy->handle,Y->rmap->n,Y->cmap->n,
                                       &a,cuspx->mat->descr,xaij->nz,xvals,xrow,xcol,
                                       &unit,cuspy->mat->descr,yaij->nz,yvals,yrow,ycol,
                                       cuspy->mat->descr,      yvals,yrow,ycol));
#endif
    PetscCall(PetscLogGpuFlops(xaij->nz + yaij->nz));
    PetscCall(PetscLogGpuTimeEnd());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    PetscCallCUDA(cudaFree(buffer));
#endif
    PetscCallCUSPARSE(cusparseSetPointerMode(cuspy->handle, CUSPARSE_POINTER_MODE_DEVICE));
  } else {
    /* SAME_NONZERO_PATTERN: the value arrays line up entry by entry */
    cublasHandle_t cublasv2handle;
    PetscBLASInt   inc = 1, nvals = 1;

    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(PetscBLASIntCast(xaij->nz,&nvals));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXaxpy(cublasv2handle,nvals,&a,xvals,inc,yvals,inc));
    PetscCall(PetscLogGpuFlops(2.0*nvals));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals));
  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&yvals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
338195639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Scale callback (Y <- a*Y) for the SeqAIJCUSPARSE type.

  Applies a cuBLAS scal over the y->nz entries of the array returned by
  MatSeqAIJCUSPARSEGetArray().
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  cublasHandle_t handle;
  PetscScalar    *vals;
  PetscBLASInt   nvals = 1;
  PetscBLASInt   inc   = 1;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSEGetArray(Y,&vals));
  PetscCall(PetscCUBLASGetHandle(&handle));
  /* cuBLAS takes a BLAS-sized integer count */
  PetscCall(PetscBLASIntCast(aij->nz,&nvals));

  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXscal(handle,nvals,&a,vals,inc));
  PetscCall(PetscLogGpuFlops(nvals));
  PetscCall(PetscLogGpuTimeEnd());

  PetscCall(MatSeqAIJCUSPARSERestoreArray(Y,&vals));
  PetscCall(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
340133c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Zero-entries callback for the SeqAIJCUSPARSE type.

  For unfactored matrices the value arrays of the GPU matrix and of its cached
  transpose are zero-filled when they exist. The host value array is always zeroed.
  The offload mask ends up as PETSC_OFFLOAD_BOTH only when the GPU matrix values were
  zeroed as well, and as PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ       *aij = (Mat_SeqAIJ*)A->data;
  PetscOffloadMask mask = PETSC_OFFLOAD_CPU;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    CsrMatrix          *csr;

    if (cusp->mat) {
      csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        /* host and device will both hold zeros */
        mask = PETSC_OFFLOAD_BOTH;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      csr = (CsrMatrix*)cusp->matTranspose->mat;
      if (csr->values) {
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
  }
  /* a->i[number of rows] is the count of stored entries */
  PetscCall(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = mask;
  PetscFunctionReturn(0);
}
34303fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Bind-to-CPU callback for the SeqAIJCUSPARSE type.

  flg == PETSC_TRUE : copy the data back from the GPU, install the SeqAIJ callbacks,
                      clear the Mat_SeqAIJ ops table and remove the composed GPU functions.
  flg == PETSC_FALSE: install the CUSPARSE callbacks and composed functions.

  Factored matrices only record the flag. Inodes are enabled only when binding to the
  CPU and inode information is present.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }

  if (!flg) {
    /* GPU callbacks on the Mat ops table */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;

    /* GPU callbacks on the Mat_SeqAIJ ops table */
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    aij->ops->getcsrandmemtype        = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE;

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* bring the data back from the GPU before switching to the SeqAIJ callbacks */
    PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;

    /* drop every entry of the Mat_SeqAIJ ops table */
    PetscCall(PetscMemzero(aij->ops,sizeof(Mat_SeqAIJOps)));

    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  }

  A->boundtocpu  = flg;
  /* inodes are used only while bound to the CPU, and only if inode data exists */
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3495a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE one.

  reuse selects where the result lives:
    MAT_INITIAL_MATRIX - *newmat is created as a duplicate of A (values copied)
    MAT_REUSE_MATRIX   - A is copied into the existing *newmat
    otherwise          - *newmat is used as it is passed in

  The result gets VECCUDA as default vector type, a freshly created cuSPARSE context in
  spptr (unless reusing a matrix, or one is already present), the CUSPARSE callbacks and
  the CUSPARSE type name. The mtype argument is not read.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat target;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat));
    break;
  case MAT_REUSE_MATRIX:
    PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  target = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  PetscCall(PetscFree(target->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&target->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !target->spptr) {
    if (target->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      PetscCall(PetscNew(&factors));
      PetscCallCUSPARSE(cusparseCreate(&factors->handle));
      PetscCallCUSPARSE(cusparseSetStream(factors->handle,PetscDefaultCudaStream));
      target->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      PetscCall(PetscNew(&cusp));
      PetscCallCUSPARSE(cusparseCreate(&cusp->handle));
      PetscCallCUSPARSE(cusparseSetStream(cusp->handle,PetscDefaultCudaStream));
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      target->spptr = cusp;
    }
    target->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* type-level callbacks; the remaining ones are installed by MatBindToCPU_SeqAIJCUSPARSE() */
  target->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  target->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  target->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  target->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  target->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  target->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  PetscCall(MatBindToCPU_SeqAIJCUSPARSE(target,PETSC_FALSE));
  PetscCall(PetscObjectChangeTypeName((PetscObject)target,MATSEQAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)target,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)target,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  PetscCall(PetscObjectComposeFunction((PetscObject)target,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
35559ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for the MATSEQAIJCUSPARSE type: builds B as a
  SeqAIJ matrix and then converts it in place.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* start from a SeqAIJ matrix ... */
  PetscCall(MatCreate_SeqAIJ(B));
  /* ... and convert that same object to the CUSPARSE type */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
356302fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
35817f756511SDominic Meiser 
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);

/*
  MatSolverTypeRegister_CUSPARSE - Registers the cuSPARSE-based factorization routines:
  the band LU solver for MATSEQAIJ, and LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factorizations served by MatGetFactor_seqaijcusparse_cusparse, in registration order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const size_t        nftypes  = sizeof(ftypes)/sizeof(ftypes[0]);
  size_t              k;

  PetscFunctionBegin;
  PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  for (k=0; k<nftypes; k++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
359529b38603SBarry Smith 
/*
  MatResetPreallocationCOO_SeqAIJCUSPARSE - Discards the COO preallocation data kept in
  the Mat_SeqAIJCUSPARSE structure of mat.

  Deletes cooPerm and cooPerm_a and, when the extended COO path was in use, frees the
  device arrays jmap_d and perm_d. Every released pointer is reset to NULL, so that
  MatSeqAIJCUSPARSE_Destroy(), which frees any non-NULL jmap_d/perm_d, does not pass
  an already-freed pointer to cudaFree().

  Does nothing when mat has no spptr.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusp->jmap_d));
    cusp->jmap_d = NULL; /* avoid a second cudaFree() at destroy time */
    PetscCallCUDA(cudaFree(cusp->perm_d));
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3613cbc6b225SStefano Zampini 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases a Mat_SeqAIJCUSPARSE structure and everything it owns:
  the mult structures for the matrix and its transpose, the auxiliary device vectors, the
  cuSPARSE handle and the COO device maps. No-op when *cusparsestruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
  PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));

  /* members released with delete */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  /* members that are only released when set */
  if (cusp->handle) PetscCallCUSPARSE(cusparseDestroy(cusp->handle));
  if (cusp->jmap_d) PetscCallCUDA(cudaFree(cusp->jmap_d));
  if (cusp->perm_d) PetscCallCUDA(cudaFree(cusp->perm_d));

  PetscCall(PetscFree(*cusparsestruct));
  PetscFunctionReturn(0);
}
36327f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its values, column_indices and
  row_offsets members, then resets *mat. No-op when *mat is already null.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = 0;
  PetscFunctionReturn(0);
}
36457f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - Releases one
  Mat_SeqAIJCUSPARSETriFactorStruct: its matrix descriptor, solve analysis info, CSR
  matrix, the solve buffer, the AA_h host buffer (released with cudaFreeHost) and,
  with CUDA >= 11, the csr2csc buffer. No-op when *trifactor is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr)         PetscCallCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo)     PetscCallCUSPARSE(cusparse_destroy_analysis_info(tf->solveInfo));
  PetscCall(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer)   PetscCallCUDA(cudaFree(tf->solveBuffer));
  if (tf->AA_h)          PetscCallCUDA(cudaFreeHost(tf->AA_h));
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) PetscCallCUDA(cudaFree(tf->csr2cscBuffer));
 #endif

  PetscCall(PetscFree(*trifactor));
  PetscFunctionReturn(0);
}
36627f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - Releases one
  Mat_SeqAIJCUSPARSEMultStruct and resets *matstruct to NULL.

  Input Parameters:
+ matstruct - address of the structure to release; nothing happens when *matstruct is NULL
- format    - storage format the structure was built with; selects how its mat member is
              interpreted (hybrid matrix for ELL/HYB, CsrMatrix otherwise)

  Notes:
  With CUDA >= 11 an ELL/HYB structure cannot be released and PETSC_ERR_SUP is raised.
  The error code of CsrMatrix_Destroy() is propagated to the caller.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code like every other call in this routine */
        PetscCall(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* scalar constants released with cudaFree() */
    if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  PetscCallCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* each of the three cuSpMV slots owns a buffer and two dense-vector descriptors once initialized */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37047f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the data held by a triangular-factors container, keeping the container

  Input Parameter:
. trifactors - address of the container pointer; when *trifactors is NULL nothing is done

  Notes:
  The four factor structures, the row/column permutation arrays, the work vector and the two band arrays on the
  device are released, and init_dev_prop is cleared. The container itself and its handle member are left alone.

  Every raw pointer released here is reset to NULL so that calling this routine again on the same container does
  not free the same memory twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose));
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose));
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      PetscCallCUDA(cudaFree((*trifactors)->a_band_d));
      (*trifactors)->a_band_d = NULL; /* do not leave a dangling device pointer behind */
    }
    if ((*trifactors)->i_band_d) {
      PetscCallCUDA(cudaFree((*trifactors)->i_band_d));
      (*trifactors)->i_band_d = NULL; /* do not leave a dangling device pointer behind */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3725ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a triangular-factors container, destroys its cuSPARSE handle
  (when one is set) and frees the container

  Input Parameter:
. trifactors - address of the container pointer; when *trifactors is NULL nothing is done
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t factors_handle;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  /* the handle is read only after the reset, exactly as before */
  factors_handle = (*trifactors)->handle;
  if (factors_handle) PetscCallCUSPARSE(cusparseDestroy(factors_handle));
  PetscCall(PetscFree(*trifactors));
  PetscFunctionReturn(0);
}
37407e8381f9SStefano Zampini 
/* Strict ordering of index pairs: first by component 0, ties broken by component 1 */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt first1 = t1.get<0>(), first2 = t2.get<0>();
    return (first1 != first2) ? (first1 < first2) : (t1.get<1>() < t2.get<1>());
  }
};
37517e8381f9SStefano Zampini 
/* Equality of index pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
37617e8381f9SStefano Zampini 
/* Change indicator: 1 when the two indices differ, 0 when they are the same */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &lhs, const PetscInt &rhs)
  {
    if (lhs != rhs) return 1;
    return 0;
  }
};
37707e8381f9SStefano Zampini 
/* Logical OR of two flags, returned as 0 or 1 (despite the name this is not an arithmetic sum) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &lhs, const PetscInt &rhs)
  {
    return (lhs != 0 || rhs != 0) ? 1 : 0;
  }
};
37797e8381f9SStefano Zampini 
37807e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic()

  Writes (INSERT_VALUES) or accumulates (ADD_VALUES) the COO values v[] into the CSR values stored on the device,
  using the permutation cooPerm and, when the COO pattern had repeated entries, the key array cooPerm_a.

  v may reside on the host or on the device; host values are first staged in a temporary device array.
  With v == NULL and INSERT_VALUES the device values are filled with zeros; with v == NULL and any other
  mode the values are left as they are. Without a cooPerm the matrix is simply assembled.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *staged_v = NULL; /* device copy of v[], only when v[] lives on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *csr;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (v) {
    const PetscInt n = cusp->cooPerm->size();

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      staged_v = new THRUSTARRAY(n);
      staged_v->assign(v,v+n);
      d_v = staged_v->data();
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
    }

    PetscCall(PetscLogGpuTimeBegin());
    if (cusp->cooPerm_a) {
      /* There are repeated entries in d_v[] that must be summed.
         thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. Its length is n, the number of nonzeros in d_v[].
         cooPerm_a is ordered; the i-th permuted value belongs to the cooPerm_a[i]-th unique nonzero. */
      auto permuted = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());

      if (imode == ADD_VALUES) {
        /* reduce into a scratch array first, then add that to the existing values */
        THRUSTARRAY reduced(csr->values->size());

        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),permuted,thrust::make_discard_iterator(),reduced.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(reduced.begin(),reduced.end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
      } else {
        /* insertion: the reduced sums overwrite the existing values directly */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),permuted,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else {
      /* all nonzeros in d_v[] are unique entries: pair each permuted value with its destination */
      auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                      csr->values->begin()));
      auto pairs_end   = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                      csr->values->end()));

      if (imode == ADD_VALUES) thrust::for_each(pairs_begin,pairs_end,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
      else                     thrust::for_each(pairs_begin,pairs_end,VecCUDAEquals());
    }
    PetscCall(PetscLogGpuTimeEnd());
  } else if (imode == INSERT_VALUES) {
    thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  }

  delete staged_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz));
  PetscCall(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax));
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
38627e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the transpose kept for a MATSEQAIJCUSPARSE matrix as out of date

  Input Parameters:
+ A       - the matrix
- destroy - when PETSC_TRUE, the transpose structure and the csr2csc_i array are also freed

  Notes:
  Nothing is done when the matrix has no CUSPARSE data attached (A->spptr is NULL).
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3878a49f1ed0SStefano Zampini 
38797e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices

  Builds the CSR pattern of A from n COO index pairs: the pairs are sorted and made unique on the device, the
  resulting row offsets and column indices are copied back into the host Mat_SeqAIJ arrays, and the zero-valued
  matrix is then pushed to the GPU. The sort permutation is kept in cusp->cooPerm; when the COO pattern
  contains repeated pairs, cusp->cooPerm_a keeps, for every sorted entry, the index of its unique nonzero.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           prev_n, nzr = 0;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(A->rmap));
  PetscCall(PetscLayoutSetUp(A->cmap));

  /* drop the cached arrays when the number of COO entries differs from the previous call */
  if (cusp->cooPerm) prev_n = cusp->cooPerm->size();
  else               prev_n = 0;
  if (n != prev_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }

  if (!n) {
    PetscCall(MatSeqAIJSetPreallocation(A,0,NULL));
  } else {
    THRUSTINTARRAY rows(n);
    THRUSTINTARRAY cols(n);
    THRUSTINTARRAY rowend(A->rmap->n);

    if (!cusp->cooPerm)   cusp->cooPerm   = new THRUSTINTARRAY(n);
    if (!cusp->cooPerm_a) cusp->cooPerm_a = new THRUSTINTARRAY(n);

    PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    rows.assign(coo_i,coo_i+n);
    cols.assign(coo_j,coo_j+n);

    /* Example:
         n = 6
         coo_i = [3,3,1,4,1,4]
         coo_j = [3,2,2,5,2,6]
    */
    auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(rows.begin(),cols.begin()));
    auto pairs_end   = thrust::make_zip_iterator(thrust::make_tuple(rows.end(),cols.end()));

    PetscCall(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(pairs_begin, pairs_end, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by column */
    *cusp->cooPerm_a = rows;       /* copy of the sorted rows */
    THRUSTINTARRAY sorted_cols(cols); /* copy of the sorted columns */

    /* After sorting:
         rows    = [1,1,3,3,4,4]
         cols    = [2,2,2,3,5,6]
         cooPerm = [2,4,1,0,3,5]
    */
    auto uniq_end = thrust::unique(pairs_begin, pairs_end, IJEqual()); /* unique (row, column) pairs */

    /* After unique, uniq_end points one past the last unique pair (x marks leftover storage):
         rows = [1,3,3,4,4,x]
         cols = [2,2,3,5,6,x]
    */
    if (uniq_end == pairs_end) {
      /* every entry is unique: no key array is needed */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j along the sorted (i,j) sequence starts a new nonzero */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a:   [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      thrust::adjacent_difference(sorted_cols.begin(),sorted_cols.end(),sorted_cols.begin(),IJDiff());                /* sorted_cols: [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      /* clear the leading entries; touching a single device element from the host implies a cudaMemcpy */
      (*cusp->cooPerm_a)[0] = 0;
      sorted_cols[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),sorted_cols.begin(),cusp->cooPerm_a->begin(),IJSum());  /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }

    /* Search every row index of [0,A->rmap->n) in the ordered unique rows, e.g. [1,3,3,4,4] with A->rmap->n = 6.
       rowend[r] receives the last position where r could be inserted without breaking the ordering,
       giving rowend = [0,1,1,3,5,5]. The leading 0 of the row offsets is added below. */
    thrust::counting_iterator<PetscInt> first_row(0);
    thrust::upper_bound(rows.begin(), uniq_end.get_iterator_tuple().get<0>(),
                        first_row, first_row + A->rmap->n,
                        rowend.begin());
    PetscCall(PetscLogGpuTimeEnd());

    /* replace the host CSR arrays by freshly allocated ones holding the new pattern */
    PetscCall(MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i));
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    PetscCall(PetscMalloc1(A->rmap->n+1,&a->i));
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    PetscCallCUDA(cudaMemcpy(a->i+1,rowend.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    PetscCall(PetscMalloc1(a->nz,&a->a));
    PetscCall(PetscMalloc1(a->nz,&a->j));
    PetscCallCUDA(cudaMemcpy(a->j,cols.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!a->ilen) PetscCall(PetscMalloc1(A->rmap->n,&a->ilen));
    if (!a->imax) PetscCall(PetscMalloc1(A->rmap->n,&a->imax));

    /* per-row lengths, longest row, and number of rows holding at least one nonzero */
    for (PetscInt r = 0; r < A->rmap->n; r++) {
      const PetscInt len = a->i[r+1] - a->i[r];

      if (len) nzr++;
      a->imax[r] = len;
      a->ilen[r] = len;
      a->rmax = PetscMax(a->rmax,len);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    PetscCall(PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt)));
    PetscCall(MatMarkDiagonal_SeqAIJ(A));
  }
  PetscCall(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  PetscCall(PetscArrayzero(a->a,a->nz));
  PetscCall(MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
3992ed502f03SStefano Zampini 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation entry point that picks between the 'Basic'
  device path and the host SeqAIJ path

  The host path is taken only when coo_i[] is host memory and a negative index is found in coo_i[] or coo_j[];
  in that case the jmap and perm arrays built by the host preallocation are mirrored on the device and
  use_extended_coo is switched on. In every other case the 'Basic' routine is used.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscBool    has_negative = PETSC_FALSE;
  PetscMemType mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  PetscCall(MatResetPreallocationCOO_SeqAIJ(mat));
  PetscCall(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* scan the host indices, stopping at the first negative one */
      for (PetscCount k=0; k<coo_n && !has_negative; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) has_negative = PETSC_TRUE;
      }
    }
  }

  if (!has_negative) { /* i,j are on device or do not contain negative indices */
    PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
    PetscFunctionReturn(0);
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  Mat_SeqAIJ         *host_aij = static_cast<Mat_SeqAIJ*>(mat->data);
  Mat_SeqAIJCUSPARSE *gpu_aij  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
  const size_t       jmap_bytes = (host_aij->nz+1)*sizeof(PetscCount);
  const size_t       perm_bytes = host_aij->Atot*sizeof(PetscCount);

  /* device copies of the host jmap[] (nz+1 entries) and perm[] (Atot entries) */
  PetscCallCUDA(cudaMalloc((void**)&gpu_aij->jmap_d,jmap_bytes));
  PetscCallCUDA(cudaMemcpy(gpu_aij->jmap_d,host_aij->jmap,jmap_bytes,cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void**)&gpu_aij->perm_d,perm_bytes));
  PetscCallCUDA(cudaMemcpy(gpu_aij->perm_d,host_aij->perm,perm_bytes,cudaMemcpyHostToDevice));
  gpu_aij->use_extended_coo = PETSC_TRUE;
  PetscFunctionReturn(0);
}
4028219fbbafSJunchao Zhang 
/*
  MatAddCOOValues - kernel that folds COO values into matrix values

  For every i in [0,nnz) the entries kv[perm[k]], k in [jmap[i],jmap[i+1]), are summed; the sum replaces a[i]
  when imode is INSERT_VALUES and is added to a[i] otherwise.

  Input Parameters:
+ kv    - the COO values
. nnz   - number of entries of a[] to update
. jmap  - nnz+1 offsets into perm[]
. perm  - indices into kv[]
- imode - INSERT_VALUES to overwrite, anything else to accumulate

  Output Parameter:
. a - the values being updated

  Notes:
  Each thread starts at its global 1D index and advances by the total number of launched threads, so any 1D
  launch configuration covers all nnz entries.
*/
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* widen to PetscCount before multiplying: the built-in index variables are 32-bit unsigned, so the
     product would otherwise be formed in 32 bits and could wrap before being stored in the wider type */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4039219fbbafSJunchao Zhang 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets or adds COO values into a MATSEQAIJCUSPARSE matrix

  When the extended COO data is in use, the values are combined on the device by the MatAddCOOValues kernel
  (host values are first copied into a temporary device buffer); otherwise the work is delegated to
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscCount    Annz = seq->nz;
  const PetscScalar   *d_vals = v; /* device-accessible values handed to the kernel */
  PetscScalar         *Aa;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
    PetscFunctionReturn(0);
  }

  PetscCall(PetscGetMemType(v,&memtype));
  const bool staged    = PetscMemTypeHost(memtype);  /* v[] is host memory and needs a device copy */
  const bool overwrite = (imode == INSERT_VALUES);   /* write-only access is enough when inserting */

  if (staged) {
    const size_t nbytes = seq->coo_n*sizeof(PetscScalar);

    PetscCallCUDA(cudaMalloc((void**)&d_vals,nbytes));
    PetscCallCUDA(cudaMemcpy((void*)d_vals,v,nbytes,cudaMemcpyHostToDevice));
  }

  if (overwrite) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
  else           PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa));

  /* skip the launch for an empty matrix */
  if (Annz) {
    MatAddCOOValues<<<(Annz+255)/256,256>>>(d_vals,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (overwrite) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
  else           PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));

  if (staged) PetscCallCUDA(cudaFree((void*)d_vals));
  PetscFunctionReturn(0);
}
4074219fbbafSJunchao Zhang 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (may be NULL if not needed)
-   j - the CSR column indices (may be NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      Nothing is done when both i and j are NULL

      The ELL and HYB storage formats are not supported

      May trigger a host-device copy, since the matrix is first brought up to date on the device

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* bail out only when nothing is requested; a single requested array must still be returned */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the full (uncompressed) offsets are built from the host a->i on first use and then cached */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
41225f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
.   compressed - PETSC_TRUE or PETSC_FALSE, the value that was passed to MatSeqAIJCUSPARSEGetIJ(); not used here
.   i - address of the CSR row pointers (may be NULL)
-   j - address of the CSR column indices (may be NULL)

    Level: developer

    Notes:
      Each non-NULL argument among i and j has its pointer reset to NULL

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  const int **handed_out[2] = {i,j};

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  for (int k = 0; k < 2; k++) {
    if (handed_out[k]) *handed_out[k] = NULL;
  }
  PetscFunctionReturn(0);
}
41495f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host.
   Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated, so an invalid A yields a PETSc error instead of a crash */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  /* hand out the raw device pointer of the CSR values array */
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4184ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Input/Output Parameter:
.   a - address of the pointer to the device data; set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the matrix itself is not modified here; only the caller's pointer is cleared */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4209ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host.
   Sets the offload mask of A to PETSC_OFFLOAD_GPU and calls MatSeqAIJCUSPARSEInvalidateTranspose().
   Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB.

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated, so an invalid A yields a PETSc error instead of a crash */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may write through the pointer: flag the device copy as the current one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Input/Output Parameter:
.   a - address of the pointer to the device data; set to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() and increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified through the pointer handed out earlier */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscObject obj = (PetscObject)A;
  PetscCall(PetscObjectStateIncrease(obj));
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4272039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU.
   Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB, and with
   PETSC_ERR_COR when the matrix has no Mat_SeqAIJCUSPARSEMultStruct or no device values array.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A->spptr is read only after A has been validated, so an invalid A yields a PETSc error instead of a crash */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* unlike the read and read-write getters, no MatSeqAIJCUSPARSECopyToGPU() is issued here */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller will overwrite the values: flag the device copy as the current one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4308ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Input/Output Parameter:
.   a - address of the pointer to the device data; set to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() and increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values were exposed for writing by the matching getter */
  PetscCall(MatSeqAIJInvalidateDiagonal(A));
  PetscObject obj = (PetscObject)A;
  PetscCall(PetscObjectStateIncrease(obj));
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4335ed502f03SStefano Zampini 
/* Lexicographic "less than" on the first two entries of a 4-tuple: entry 0 is compared first and
   entry 1 breaks ties; entries 2 and 3 do not take part in the ordering.
   Entries are read with the free function thrust::get<N>() rather than the member tuple.get<N>(),
   which is not available in every Thrust/CCCL release. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    const int i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 < i2) return true;
    if (i1 == i2) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
4346ed502f03SStefano Zampini 
/* Unary functor returning its integer argument plus a fixed offset chosen at construction */
struct Shift
{
  int _shift; /* offset added to every argument */

  Shift(int shift) { _shift = shift; }
  __host__ __device__
  inline int operator() (const int &c)
  {
    int shifted = c;

    shifted += _shift;
    return shifted;
  }
};
4358ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices A, B with the same number of rows by appending each row of B to the corresponding row of A, i.e. C = [A,B], which is [A';B']' in MATLAB notation */
4360ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4361ed502f03SStefano Zampini {
4362ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4363ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4364ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4365ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4366ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4367ed502f03SStefano Zampini   cusparseStatus_t             stat;
4368ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4369ed502f03SStefano Zampini 
4370ed502f03SStefano Zampini   PetscFunctionBegin;
4371ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4372ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4373ed502f03SStefano Zampini   PetscValidPointer(C,4);
4374ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4375ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
43765f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
437708401ef6SPierre Jolivet   PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
43782c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
43792c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4380ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4381ed502f03SStefano Zampini     m     = A->rmap->n;
4382ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
43839566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,C));
43849566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(*C,m,n,m,n));
43859566063dSJacob Faibussowitsch     PetscCall(MatSetType(*C,MATSEQAIJCUSPARSE));
4386ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4387ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4388ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4389ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4390ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4391ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4392ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4393ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4394ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4395ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4396ed502f03SStefano Zampini     Ccusp->nrows    = m;
4397ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4398ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4399ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4400ed502f03SStefano Zampini     Ccsr->num_cols  = n;
44019566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
44029566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
44039566063dSJacob Faibussowitsch     PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
44049566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
44059566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
44069566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
44079566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44089566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
44099566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44109566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
44119566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
441228b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
441328b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4414ed502f03SStefano Zampini 
4415ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4416ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4417ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4418ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4419ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4420ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4421ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4422ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4423ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4424ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4425ed502f03SStefano Zampini     if (c->nz) {
44262ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44272ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44282ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44292ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44302ed87e7eSStefano Zampini 
4431ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4432ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4433ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4434ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
44359566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4436ed502f03SStefano Zampini         }
44372ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44382ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4439ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4440ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4441ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4442ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
44439566063dSJacob Faibussowitsch           PetscCall(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
4444ed502f03SStefano Zampini         }
44452ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44462ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
44479566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
44482ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44492ed87e7eSStefano Zampini                               Aroff->data().get(),
44502ed87e7eSStefano Zampini                               Annz,
44512ed87e7eSStefano Zampini                               m,
44522ed87e7eSStefano Zampini                               Acoo->data().get(),
44539566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
4454ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44552ed87e7eSStefano Zampini                               Broff->data().get(),
4456ed502f03SStefano Zampini                               Bnnz,
4457ed502f03SStefano Zampini                               m,
44582ed87e7eSStefano Zampini                               Bcoo->data().get(),
44599566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
44602ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44612ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44622ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44638909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4464ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4465ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44668909a122SStefano Zampini #else
44678909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44688909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44698909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44708909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44718909a122SStefano Zampini #endif
44722ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44732ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44742ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44752ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
44762ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
44772ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4478ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4479ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4480ed502f03SStefano Zampini       thrust::advance(p2,Annz);
44812ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
44828909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
44838909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
44848909a122SStefano Zampini #endif
44852ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
44862ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
44872ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
44882ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
44892ed87e7eSStefano Zampini #else
44902ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
44912ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
44922ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
44932ed87e7eSStefano Zampini #endif
4494ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
44952ed87e7eSStefano Zampini                               Ccoo->data().get(),
4496ed502f03SStefano Zampini                               c->nz,
4497ed502f03SStefano Zampini                               m,
4498ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
44999566063dSJacob Faibussowitsch                               CUSPARSE_INDEX_BASE_ZERO);PetscCallCUSPARSE(stat);
45009566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
45012ed87e7eSStefano Zampini       delete wPerm;
45022ed87e7eSStefano Zampini       delete Acoo;
45032ed87e7eSStefano Zampini       delete Bcoo;
45042ed87e7eSStefano Zampini       delete Ccoo;
4505ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4506ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4507ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4508ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
45099566063dSJacob Faibussowitsch                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4510ed502f03SStefano Zampini #endif
45111a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45129566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
45139566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4514ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4515ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4516ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4517ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4518ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4519ed502f03SStefano Zampini 
45201a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45211a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4522a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4523ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4524ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4525ed502f03SStefano Zampini         CcsrT->num_rows = n;
4526ed502f03SStefano Zampini         CcsrT->num_cols = m;
4527ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4528ed502f03SStefano Zampini 
4529ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4530ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4531ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4532ed502f03SStefano Zampini 
45339566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeBegin());
4534ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4535ed502f03SStefano Zampini         if (AT) {
4536ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4537ed502f03SStefano Zampini           thrust::advance(rT,-1);
4538ed502f03SStefano Zampini         }
4539ed502f03SStefano Zampini         if (BT) {
4540ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4541ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4542ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4543ed502f03SStefano Zampini         }
4544ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4545ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4546ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4547ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4548ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4549ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
45509566063dSJacob Faibussowitsch         PetscCall(PetscLogGpuTimeEnd());
4551ed502f03SStefano Zampini 
45529566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
45539566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
45549566063dSJacob Faibussowitsch         PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
45559566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
45569566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
45579566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
45589566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
45599566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
45609566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4561ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4562ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4563ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4564ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
45659566063dSJacob Faibussowitsch                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);PetscCallCUSPARSE(stat);
4566ed502f03SStefano Zampini #endif
4567ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4568ed502f03SStefano Zampini       }
4569ed502f03SStefano Zampini     }
4570ed502f03SStefano Zampini 
4571ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4572ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4573ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
45749566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m+1,&c->i));
45759566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->j));
4576ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4577ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4578ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4579ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4580ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
45819566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45829566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4583ed502f03SStefano Zampini     } else {
45849566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45859566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4586ed502f03SStefano Zampini     }
45879566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
45889566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->ilen));
45899566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(m,&c->imax));
4590ed502f03SStefano Zampini     c->maxnz = c->nz;
4591ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4592ed502f03SStefano Zampini     c->rmax = 0;
4593ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4594ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4595ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4596ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4597ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4598ed502f03SStefano Zampini     }
45999566063dSJacob Faibussowitsch     PetscCall(MatMarkDiagonal_SeqAIJ(*C));
46009566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(c->nz,&c->a));
4601ed502f03SStefano Zampini     (*C)->nonzerostate++;
46029566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->rmap));
46039566063dSJacob Faibussowitsch     PetscCall(PetscLayoutSetUp((*C)->cmap));
4604ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4605ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4606ed502f03SStefano Zampini   } else {
460708401ef6SPierre Jolivet     PetscCheck((*C)->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4608ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4609ed502f03SStefano Zampini     if (c->nz) {
4610ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
46115f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
46122c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
461308401ef6SPierre Jolivet       PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
46149566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
46159566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));
46165f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
46175f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4618ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4619ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4620ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
46212c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
46222c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
46232c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
46242c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
46255f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4626ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4627ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
46289566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeBegin());
4629ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4630ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4631ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4632ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4633ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4634ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4635ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4636ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4637ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4638ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
46399566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
46401a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
46415f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4642ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4643ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4644ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4645ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4646ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4647ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4648ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46491a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4650ed502f03SStefano Zampini       }
46519566063dSJacob Faibussowitsch       PetscCall(PetscLogGpuTimeEnd());
4652ed502f03SStefano Zampini     }
4653ed502f03SStefano Zampini   }
46549566063dSJacob Faibussowitsch   PetscCall(PetscObjectStateIncrease((PetscObject)*C));
4655ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4656ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4657ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4658ed502f03SStefano Zampini   PetscFunctionReturn(0);
4659ed502f03SStefano Zampini }
4660c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the value array of A into v.

  Input Parameters:
+ A   - matrix whose value array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - positions inside the value array to gather (presumably a host array, since it is
        uploaded with assign() and logged as a CPU-to-GPU transfer); if NULL, or if n is 0,
        the first n entries are copied contiguously

  Output Parameter:
. v - destination of the n values; isCudaMem() decides whether it is treated as device memory
      (device-to-device copy) or host memory (device-to-host copy)

  Notes:
  When indices are given and v is not device memory, the gather is done into a temporary device
  buffer that is then copied to v with a single cudaMemcpy().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    PetscCall(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    /* Staging buffer, only populated when v cannot be written directly by the device.
       It is a scoped object (empty when unused) so it is released on every exit path,
       including an early error return from PetscCallCUDA() below. */
    THRUSTARRAY w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k]; VecCUDAEquals is applied to each pair (presumably assigning the first to the second) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      PetscCallCUDA(cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
  } else {
    PetscCallCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled from the device to the host, so log them as a GPU-to-CPU transfer */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4698