xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 28b400f66ebc7ae0049166a2294dfcd3df27e64b)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26afb2bd1cSJunchao Zhang 
27afb2bd1cSJunchao Zhang   typedef enum {
28afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
29afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
30afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
31afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
32afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
33afb2bd1cSJunchao Zhang 
34afb2bd1cSJunchao Zhang   typedef enum {
35afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
47afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
48afb2bd1cSJunchao Zhang 
49afb2bd1cSJunchao Zhang   typedef enum {
50afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
53afb2bd1cSJunchao Zhang   */
54afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57afb2bd1cSJunchao Zhang #endif
589ae82921SPaul Mullowney 
59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
877f756511SDominic Meiser 
8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
9057181aedSStefano Zampini 
91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
93219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94c215019aSStefano Zampini 
95b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
96b06137fdSPaul Mullowney {
97b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98b06137fdSPaul Mullowney 
99b06137fdSPaul Mullowney   PetscFunctionBegin;
100*28b400f6SJacob Faibussowitsch   PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
1025f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream));
103b06137fdSPaul Mullowney   PetscFunctionReturn(0);
104b06137fdSPaul Mullowney }
105b06137fdSPaul Mullowney 
106b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
107b06137fdSPaul Mullowney {
108b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
109b06137fdSPaul Mullowney 
110b06137fdSPaul Mullowney   PetscFunctionBegin;
111*28b400f6SJacob Faibussowitsch   PetscCheck(cusparsestruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1126b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11316a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
1145f80ce2aSJacob Faibussowitsch       CHKERRCUSPARSE(cusparseDestroy(cusparsestruct->handle));
11516a2e217SAlejandro Lamas Daviña     }
116b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1176b1cf21dSAlejandro Lamas Daviña   }
1185f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE));
119b06137fdSPaul Mullowney   PetscFunctionReturn(0);
120b06137fdSPaul Mullowney }
121b06137fdSPaul Mullowney 
122b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
123b06137fdSPaul Mullowney {
124b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1257e8381f9SStefano Zampini   PetscBool          flg;
126ccdfe979SStefano Zampini 
127b06137fdSPaul Mullowney   PetscFunctionBegin;
1285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
1297e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
130ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
131b06137fdSPaul Mullowney   PetscFunctionReturn(0);
132b06137fdSPaul Mullowney }
133b06137fdSPaul Mullowney 
134ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1359ae82921SPaul Mullowney {
1369ae82921SPaul Mullowney   PetscFunctionBegin;
1379ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1389ae82921SPaul Mullowney   PetscFunctionReturn(0);
1399ae82921SPaul Mullowney }
1409ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix solver type providing triangular solvers for sequential
  matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1539ae82921SPaul Mullowney 
15442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1559ae82921SPaul Mullowney {
156bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1579ae82921SPaul Mullowney 
1589ae82921SPaul Mullowney   PetscFunctionBegin;
1595f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(PetscObjectComm((PetscObject)A),B));
1605f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(*B,n,n,n,n));
1612c7c0729SBarry Smith   (*B)->factortype = ftype;
1625f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(*B,MATSEQAIJCUSPARSE));
1632205254eSKarl Rupp 
1645f80ce2aSJacob Faibussowitsch   if (A->boundtocpu && A->bindingpropagates) CHKERRQ(MatBindToCPU(*B,PETSC_TRUE));
165087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
1665f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetBlockSizesFromMats(*B,A,A));
1679c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
1689ae82921SPaul Mullowney       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1699ae82921SPaul Mullowney       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1709c1083e7SRichard Tran Mills     } else {
1719c1083e7SRichard Tran Mills       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1729c1083e7SRichard Tran Mills       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1739c1083e7SRichard Tran Mills     }
1745f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]));
1755f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]));
1765f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
177087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1789c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
179087f3262SPaul Mullowney       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
180087f3262SPaul Mullowney       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1819c1083e7SRichard Tran Mills     } else {
1829c1083e7SRichard Tran Mills       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1839c1083e7SRichard Tran Mills       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1849c1083e7SRichard Tran Mills     }
1855f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
1865f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]));
1879ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
188bc3f50f2SPaul Mullowney 
1895f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL));
1904ac6704cSBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1915f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse));
1929ae82921SPaul Mullowney   PetscFunctionReturn(0);
1939ae82921SPaul Mullowney }
1949ae82921SPaul Mullowney 
195bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
196ca45077fSPaul Mullowney {
197aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1986e111a19SKarl Rupp 
199ca45077fSPaul Mullowney   PetscFunctionBegin;
200ca45077fSPaul Mullowney   switch (op) {
201e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
202aa372e3fSPaul Mullowney     cusparsestruct->format = format;
203ca45077fSPaul Mullowney     break;
204e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
205aa372e3fSPaul Mullowney     cusparsestruct->format = format;
206ca45077fSPaul Mullowney     break;
207ca45077fSPaul Mullowney   default:
20898921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
209ca45077fSPaul Mullowney   }
210ca45077fSPaul Mullowney   PetscFunctionReturn(0);
211ca45077fSPaul Mullowney }
2129ae82921SPaul Mullowney 
213e057df02SPaul Mullowney /*@
214e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
215e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
216aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
217e057df02SPaul Mullowney    Not Collective
218e057df02SPaul Mullowney 
219e057df02SPaul Mullowney    Input Parameters:
2208468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
22136d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2222692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
223e057df02SPaul Mullowney 
224e057df02SPaul Mullowney    Output Parameter:
225e057df02SPaul Mullowney 
226e057df02SPaul Mullowney    Level: intermediate
227e057df02SPaul Mullowney 
2288468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
229e057df02SPaul Mullowney @*/
230e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
231e057df02SPaul Mullowney {
232e057df02SPaul Mullowney   PetscFunctionBegin;
233e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
2345f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format)));
235e057df02SPaul Mullowney   PetscFunctionReturn(0);
236e057df02SPaul Mullowney }
237e057df02SPaul Mullowney 
238365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
239365b711fSMark Adams {
240365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
241365b711fSMark Adams 
242365b711fSMark Adams   PetscFunctionBegin;
243365b711fSMark Adams   cusparsestruct->use_cpu_solve = use_cpu;
244365b711fSMark Adams   PetscFunctionReturn(0);
245365b711fSMark Adams }
246365b711fSMark Adams 
247365b711fSMark Adams /*@
248365b711fSMark Adams    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
249365b711fSMark Adams 
250365b711fSMark Adams    Input Parameters:
251365b711fSMark Adams +  A - Matrix of type SEQAIJCUSPARSE
252365b711fSMark Adams -  use_cpu - set flag for using the built-in CPU MatSolve
253365b711fSMark Adams 
254365b711fSMark Adams    Output Parameter:
255365b711fSMark Adams 
256365b711fSMark Adams    Notes:
257365b711fSMark Adams    The cuSparse LU solver currently computes the factors with the built-in CPU method
258365b711fSMark Adams    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
259365b711fSMark Adams    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
260365b711fSMark Adams 
261365b711fSMark Adams    Level: intermediate
262365b711fSMark Adams 
263365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
264365b711fSMark Adams @*/
265365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
266365b711fSMark Adams {
267365b711fSMark Adams   PetscFunctionBegin;
268365b711fSMark Adams   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
2695f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu)));
270365b711fSMark Adams   PetscFunctionReturn(0);
271365b711fSMark Adams }
272365b711fSMark Adams 
2731a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
274e6e9a74fSStefano Zampini {
275e6e9a74fSStefano Zampini   PetscFunctionBegin;
2761a2c6b5cSJunchao Zhang   switch (op) {
2771a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
2781a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
2795f80ce2aSJacob Faibussowitsch       if (A->form_explicit_transpose && !flg) CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
2801a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
2811a2c6b5cSJunchao Zhang       break;
2821a2c6b5cSJunchao Zhang     default:
2835f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSetOption_SeqAIJ(A,op,flg));
2841a2c6b5cSJunchao Zhang       break;
285e6e9a74fSStefano Zampini   }
286e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
287e6e9a74fSStefano Zampini }
288e6e9a74fSStefano Zampini 
289bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
290bddcd29dSMark Adams 
291bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
292bddcd29dSMark Adams {
293bddcd29dSMark Adams   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
294bddcd29dSMark Adams   IS             isrow = b->row,iscol = b->col;
295bddcd29dSMark Adams   PetscBool      row_identity,col_identity;
296365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
297bddcd29dSMark Adams 
298bddcd29dSMark Adams   PetscFunctionBegin;
2995f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
3005f80ce2aSJacob Faibussowitsch   CHKERRQ(MatLUFactorNumeric_SeqAIJ(B,A,info));
301bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
302bddcd29dSMark Adams   /* determine which version of MatSolve needs to be used. */
3035f80ce2aSJacob Faibussowitsch   CHKERRQ(ISIdentity(isrow,&row_identity));
3045f80ce2aSJacob Faibussowitsch   CHKERRQ(ISIdentity(iscol,&col_identity));
305bddcd29dSMark Adams   if (row_identity && col_identity) {
306365b711fSMark Adams     if (!cusparsestruct->use_cpu_solve) {
307bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
308bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
309365b711fSMark Adams     }
310bddcd29dSMark Adams     B->ops->matsolve = NULL;
311bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
312bddcd29dSMark Adams   } else {
313365b711fSMark Adams     if (!cusparsestruct->use_cpu_solve) {
314bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
315bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
316365b711fSMark Adams     }
317bddcd29dSMark Adams     B->ops->matsolve = NULL;
318bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
319bddcd29dSMark Adams   }
320bddcd29dSMark Adams 
321bddcd29dSMark Adams   /* get the triangular factors */
322365b711fSMark Adams   if (!cusparsestruct->use_cpu_solve) {
3235f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
324365b711fSMark Adams   }
325bddcd29dSMark Adams   PetscFunctionReturn(0);
326bddcd29dSMark Adams }
327bddcd29dSMark Adams 
3284416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
3299ae82921SPaul Mullowney {
3309ae82921SPaul Mullowney   PetscErrorCode           ierr;
331e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
3329ae82921SPaul Mullowney   PetscBool                flg;
333a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
3346e111a19SKarl Rupp 
3359ae82921SPaul Mullowney   PetscFunctionBegin;
3365f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options"));
3379ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
338e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
339a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
3405f80ce2aSJacob Faibussowitsch     if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format));
341afb2bd1cSJunchao Zhang 
3424c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
343a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
3445f80ce2aSJacob Faibussowitsch     if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
3455f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg));
3465f80ce2aSJacob Faibussowitsch     if (flg) CHKERRQ(MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve));
347afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
348afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
349afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
350afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
3518efa179dSJose E. Roman #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
3522c71b3e2SJacob Faibussowitsch     PetscCheckFalse(flg && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
353a435da06SStefano Zampini #else
3542c71b3e2SJacob Faibussowitsch     PetscCheckFalse(flg && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
355a435da06SStefano Zampini #endif
356afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
357afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
3582c71b3e2SJacob Faibussowitsch     PetscCheckFalse(flg && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
359afb2bd1cSJunchao Zhang 
360afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
361afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
3622c71b3e2SJacob Faibussowitsch     PetscCheckFalse(flg && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
363afb2bd1cSJunchao Zhang    #endif
3644c87dfd4SPaul Mullowney   }
3655f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsTail());
3669ae82921SPaul Mullowney   PetscFunctionReturn(0);
3679ae82921SPaul Mullowney }
3689ae82921SPaul Mullowney 
3696fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3709ae82921SPaul Mullowney {
371da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3729ae82921SPaul Mullowney 
3739ae82921SPaul Mullowney   PetscFunctionBegin;
3745f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
3755f80ce2aSJacob Faibussowitsch   CHKERRQ(MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
3769ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3779ae82921SPaul Mullowney   PetscFunctionReturn(0);
3789ae82921SPaul Mullowney }
3799ae82921SPaul Mullowney 
3806fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3819ae82921SPaul Mullowney {
382da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3839ae82921SPaul Mullowney 
3849ae82921SPaul Mullowney   PetscFunctionBegin;
3855f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
3865f80ce2aSJacob Faibussowitsch   CHKERRQ(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info));
3879ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3889ae82921SPaul Mullowney   PetscFunctionReturn(0);
3899ae82921SPaul Mullowney }
3909ae82921SPaul Mullowney 
391087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
392087f3262SPaul Mullowney {
393da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
394087f3262SPaul Mullowney 
395087f3262SPaul Mullowney   PetscFunctionBegin;
3965f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
3975f80ce2aSJacob Faibussowitsch   CHKERRQ(MatICCFactorSymbolic_SeqAIJ(B,A,perm,info));
398087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
399087f3262SPaul Mullowney   PetscFunctionReturn(0);
400087f3262SPaul Mullowney }
401087f3262SPaul Mullowney 
402087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
403087f3262SPaul Mullowney {
404da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
405087f3262SPaul Mullowney 
406087f3262SPaul Mullowney   PetscFunctionBegin;
4075f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors));
4085f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info));
409087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
410087f3262SPaul Mullowney   PetscFunctionReturn(0);
411087f3262SPaul Mullowney }
412087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds, or refreshes the values of, the device CSR copy of
  the lower triangular ILU factor referenced by loTriFactorPtr.

  Input Parameter:
. A - matrix whose A->data is read as Mat_SeqAIJ and whose A->spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  First call (loTriFactorPtr is NULL): row offsets, column indices and values are assembled in pinned
  host buffers, each row being closed by an explicit 1.0 on the diagonal. They are copied into thrust
  device arrays, the cuSPARSE descriptor is set up (lower fill, unit diagonal) and the triangular solve
  analysis is run. The pinned value buffer is kept in AA_h; the two index buffers are released.

  Later calls: only the values are regenerated into AA_h and copied to the device.

  NOTE(review): the fill loops read a->a / a->j sequentially from their start for rows 1..n-1, which
  presumes the strictly lower entries of the factor are stored contiguously there - confirm against
  the SeqAIJ factor layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows    = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  const PetscInt                    *ai      = aij->i;
  const PetscInt                    *colSrc  = aij->j; /* walks the host column indices */
  const MatScalar                   *valSrc  = aij->a; /* walks the host values */

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* nothing to do unless the offload mask is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the lower triangular matrix, including the 1's on the diagonal */
    const PetscInt nzLower = nrows + ai[nrows] - ai[1];
    PetscInt       row,len;
    PetscInt       dst = 1; /* next free slot in the packed arrays; slot 0 is the diagonal of row 0 */

    if (!lo) {
      PetscScalar *AALo;
      PetscInt    *AiLo,*AjLo;
      CsrMatrix   *csr;

      /* pinned host staging buffers for values, row offsets and column indices */
      CHKERRCUDA(cudaMallocHost((void**)&AALo,nzLower*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMallocHost((void**)&AiLo,(nrows+1)*sizeof(PetscInt)));
      CHKERRCUDA(cudaMallocHost((void**)&AjLo,nzLower*sizeof(PetscInt)));

      /* row 0 consists of its unit diagonal only */
      AiLo[0]     = (PetscInt)0;
      AiLo[nrows] = nzLower;
      AjLo[0]     = (PetscInt)0;
      AALo[0]     = (MatScalar)1.0;
      for (row=1; row<nrows; row++) {
        len       = ai[row+1] - ai[row];
        AiLo[row] = dst;

        /* off-diagonal part of the row ... */
        CHKERRQ(PetscArraycpy(AjLo+dst,colSrc,len));
        CHKERRQ(PetscArraycpy(AALo+dst,valSrc,len));
        dst += len;

        /* ... followed by the additional 1 for the term on the diagonal */
        AjLo[dst] = (PetscInt)row;
        AALo[dst] = (MatScalar)1.0;
        dst++;

        colSrc += len;
        valSrc += len;
      }

      /* triangular factor bookkeeping */
      CHKERRQ(PetscNew(&lo));
      lo->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, lower fill, unit diagonal */
      CHKERRCUSPARSE(cusparseCreateMatDescr(&lo->descr));
      CHKERRCUSPARSE(cusparseSetMatIndexBase(lo->descr,CUSPARSE_INDEX_BASE_ZERO));
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      CHKERRCUSPARSE(cusparseSetMatType(lo->descr,CUSPARSE_MATRIX_TYPE_GENERAL));
     #else
      CHKERRCUSPARSE(cusparseSetMatType(lo->descr,CUSPARSE_MATRIX_TYPE_TRIANGULAR));
     #endif
      CHKERRCUSPARSE(cusparseSetMatFillMode(lo->descr,CUSPARSE_FILL_MODE_LOWER));
      CHKERRCUSPARSE(cusparseSetMatDiagType(lo->descr,CUSPARSE_DIAG_TYPE_UNIT));

      lo->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* device CSR storage, filled from the staging buffers */
      csr              = new CsrMatrix;
      lo->csrMat       = csr;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nzLower;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(AiLo,AiLo+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nzLower);
      csr->column_indices->assign(AjLo,AjLo+nzLower);

      csr->values = new THRUSTARRAY(nzLower);
      csr->values->assign(AALo,AALo+nzLower);

      /* solve analysis; with CUDA >= 9 a work buffer has to be sized and allocated first */
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
      CHKERRCUSPARSE(cusparse_create_analysis_info(&lo->solveInfo));
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      CHKERRCUSPARSE(cusparse_get_svbuffsize(factors->handle,lo->solveOp,
                                             csr->num_rows,csr->num_entries,lo->descr,
                                             csr->values->data().get(),csr->row_offsets->data().get(),
                                             csr->column_indices->data().get(),lo->solveInfo,
                                             &lo->solveBufferSize));
      CHKERRCUDA(cudaMalloc(&lo->solveBuffer,lo->solveBufferSize));
      CHKERRCUSPARSE(cusparse_analysis(factors->handle,lo->solveOp,
                                       csr->num_rows,csr->num_entries,lo->descr,
                                       csr->values->data().get(),csr->row_offsets->data().get(),
                                       csr->column_indices->data().get(),
                                       lo->solveInfo,lo->solvePolicy,lo->solveBuffer));
     #else
      CHKERRCUSPARSE(cusparse_analysis(factors->handle,lo->solveOp,
                                       csr->num_rows,csr->num_entries,lo->descr,
                                       csr->values->data().get(),csr->row_offsets->data().get(),
                                       csr->column_indices->data().get(),
                                       lo->solveInfo));
     #endif
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

      /* publish the factor; the value buffer is kept for later refreshes, the index buffers are not */
      factors->loTriFactorPtr = lo;
      lo->AA_h                = AALo;
      CHKERRCUDA(cudaFreeHost(AiLo));
      CHKERRCUDA(cudaFreeHost(AjLo));
      CHKERRQ(PetscLogCpuToGpu((nrows+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar)));
    } else {
      /* structure already on the device: update values only */
      if (!lo->AA_h) {
        CHKERRCUDA(cudaMallocHost((void**)&lo->AA_h,nzLower*sizeof(PetscScalar)));
      }
      lo->AA_h[0] = 1.0;
      for (row=1; row<nrows; row++) {
        len = ai[row+1] - ai[row];
        CHKERRQ(PetscArraycpy(lo->AA_h+dst,valSrc,len));
        dst          += len;
        lo->AA_h[dst] = 1.0;
        dst++;
        valSrc += len;
      }
      lo->csrMat->values->assign(lo->AA_h,lo->AA_h+nzLower);
      CHKERRQ(PetscLogCpuToGpu(nzLower*sizeof(PetscScalar)));
    }
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
5569ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds, or refreshes the values of, the device CSR copy of
  the upper triangular ILU factor referenced by upTriFactorPtr.

  Input Parameter:
. A - matrix whose A->data is read as Mat_SeqAIJ and whose A->spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  Rows are visited from the last to the first. For row i the host entries a->a[adiag[i+1]+1 .. adiag[i]]
  are read; the last of them (at adiag[i]) is treated as the diagonal. In the packed row the diagonal
  slot comes first and receives the reciprocal of that host value, followed by the remaining entries
  copied unchanged.
  NOTE(review): presumably the host factor stores the inverted diagonal, so the reciprocal restores
  the actual diagonal expected by the NON_UNIT descriptor - confirm against the SeqAIJ factorization.

  First call (upTriFactorPtr is NULL): structure and values are staged in pinned host buffers, copied
  into thrust device arrays, the cuSPARSE descriptor is set up (upper fill, non-unit diagonal) and the
  triangular solve analysis is run. The pinned value buffer is kept in AA_h; the index buffers are freed.

  Later calls: only the values are regenerated into AA_h and copied to the device.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows    = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    *aj      = aij->j,*adiag = aij->diag;
  const MatScalar                   *aa      = aij->a;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  /* nothing to do unless the offload mask is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the upper triangular matrix */
    const PetscInt nzUpper = adiag[0] - adiag[nrows];
    PetscInt       row,len;
    PetscInt       dst = nzUpper; /* packed arrays are filled from the back */

    if (!up) {
      PetscScalar *AAUp;
      PetscInt    *AiUp,*AjUp;
      CsrMatrix   *csr;

      /* pinned host staging buffers for values, row offsets and column indices */
      CHKERRCUDA(cudaMallocHost((void**)&AAUp,nzUpper*sizeof(PetscScalar)));
      CHKERRCUDA(cudaMallocHost((void**)&AiUp,(nrows+1)*sizeof(PetscInt)));
      CHKERRCUDA(cudaMallocHost((void**)&AjUp,nzUpper*sizeof(PetscInt)));

      AiUp[0]     = (PetscInt)0;
      AiUp[nrows] = nzUpper;
      for (row=nrows-1; row>=0; row--) {
        const PetscInt  first    = adiag[row+1] + 1;
        const MatScalar *rowVals = aa + first;
        const PetscInt  *rowCols = aj + first;

        /* number of elements NOT on the diagonal */
        len = adiag[row] - adiag[row+1] - 1;

        /* step back to the start of this row in the packed arrays */
        dst -= len + 1;

        /* diagonal first, then the off-diagonal entries */
        AjUp[dst] = (PetscInt)row;
        AAUp[dst] = (MatScalar)1./rowVals[len];
        AiUp[row] = dst; /* same value as AiUp[row+1] - (len+1) */

        CHKERRQ(PetscArraycpy(AjUp+dst+1,rowCols,len));
        CHKERRQ(PetscArraycpy(AAUp+dst+1,rowVals,len));
      }

      /* triangular factor bookkeeping */
      CHKERRQ(PetscNew(&up));
      up->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill, non-unit diagonal */
      CHKERRCUSPARSE(cusparseCreateMatDescr(&up->descr));
      CHKERRCUSPARSE(cusparseSetMatIndexBase(up->descr,CUSPARSE_INDEX_BASE_ZERO));
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      CHKERRCUSPARSE(cusparseSetMatType(up->descr,CUSPARSE_MATRIX_TYPE_GENERAL));
     #else
      CHKERRCUSPARSE(cusparseSetMatType(up->descr,CUSPARSE_MATRIX_TYPE_TRIANGULAR));
     #endif
      CHKERRCUSPARSE(cusparseSetMatFillMode(up->descr,CUSPARSE_FILL_MODE_UPPER));
      CHKERRCUSPARSE(cusparseSetMatDiagType(up->descr,CUSPARSE_DIAG_TYPE_NON_UNIT));

      up->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* device CSR storage, filled from the staging buffers */
      csr              = new CsrMatrix;
      up->csrMat       = csr;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nzUpper;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(AiUp,AiUp+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nzUpper);
      csr->column_indices->assign(AjUp,AjUp+nzUpper);

      csr->values = new THRUSTARRAY(nzUpper);
      csr->values->assign(AAUp,AAUp+nzUpper);

      /* solve analysis; with CUDA >= 9 a work buffer has to be sized and allocated first */
      CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
      CHKERRCUSPARSE(cusparse_create_analysis_info(&up->solveInfo));
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      CHKERRCUSPARSE(cusparse_get_svbuffsize(factors->handle,up->solveOp,
                                             csr->num_rows,csr->num_entries,up->descr,
                                             csr->values->data().get(),csr->row_offsets->data().get(),
                                             csr->column_indices->data().get(),up->solveInfo,
                                             &up->solveBufferSize));
      CHKERRCUDA(cudaMalloc(&up->solveBuffer,up->solveBufferSize));
      CHKERRCUSPARSE(cusparse_analysis(factors->handle,up->solveOp,
                                       csr->num_rows,csr->num_entries,up->descr,
                                       csr->values->data().get(),csr->row_offsets->data().get(),
                                       csr->column_indices->data().get(),
                                       up->solveInfo,up->solvePolicy,up->solveBuffer));
     #else
      CHKERRCUSPARSE(cusparse_analysis(factors->handle,up->solveOp,
                                       csr->num_rows,csr->num_entries,up->descr,
                                       csr->values->data().get(),csr->row_offsets->data().get(),
                                       csr->column_indices->data().get(),
                                       up->solveInfo));
     #endif
      CHKERRCUDA(WaitForCUDA());
      CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

      /* publish the factor; the value buffer is kept for later refreshes, the index buffers are not */
      factors->upTriFactorPtr = up;
      up->AA_h                = AAUp;
      CHKERRCUDA(cudaFreeHost(AiUp));
      CHKERRCUDA(cudaFreeHost(AjUp));
      CHKERRQ(PetscLogCpuToGpu((nrows+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar)));
    } else {
      /* structure already on the device: update values only */
      if (!up->AA_h) {
        CHKERRCUDA(cudaMallocHost((void**)&up->AA_h,nzUpper*sizeof(PetscScalar)));
      }
      for (row=nrows-1; row>=0; row--) {
        const MatScalar *rowVals = aa + adiag[row+1] + 1;

        /* number of elements NOT on the diagonal */
        len = adiag[row] - adiag[row+1] - 1;

        /* step back to the start of this row in the packed array */
        dst -= len + 1;

        /* diagonal first, then the off-diagonal entries */
        up->AA_h[dst] = 1./rowVals[len];
        CHKERRQ(PetscArraycpy(up->AA_h+dst+1,rowVals,len));
      }
      up->csrMat->values->assign(up->AA_h,up->AA_h+nzUpper);
      CHKERRQ(PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar)));
    }
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6999ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Pushes the ILU factors of A and, when needed, the row and
  column permutations to the GPU.

  Input Parameter:
. A - matrix whose A->data is read as Mat_SeqAIJ and whose A->spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Errors with PETSC_ERR_COR when A->spptr is NULL.

  The lower and upper triangular factors are built first, then the work vector of length n is created
  if it does not exist yet, the nonzero count is recorded and A->offloadmask is set to
  PETSC_OFFLOAD_BOTH. Each permutation (a->row, a->icol) is copied to the device only when it is not
  the identity and no device copy exists yet.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  PetscBool                    identity;

  PetscFunctionBegin;
  PetscCheck(factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* triangular factors: lower first, then upper */
  CHKERRQ(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A));
  CHKERRQ(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A));

  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: uploaded once, and only if it actually permutes */
  CHKERRQ(ISIdentity(rowperm,&identity));
  if (!identity && !factors->rpermIndices) {
    const PetscInt *idx;

    CHKERRQ(ISGetIndices(rowperm,&idx));
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx,idx+nrows);
    CHKERRQ(ISRestoreIndices(rowperm,&idx));
    CHKERRQ(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
  }

  /* column permutation: same treatment */
  CHKERRQ(ISIdentity(colperm,&identity));
  if (!identity && !factors->cpermIndices) {
    const PetscInt *idx;

    CHKERRQ(ISGetIndices(colperm,&idx));
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx,idx+nrows);
    CHKERRQ(ISRestoreIndices(colperm,&idx));
    CHKERRQ(PetscLogCpuToGpu(nrows*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
7429ae82921SPaul Mullowney 
743087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
744087f3262SPaul Mullowney {
745087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
746087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
747aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
748aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
749087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
750087f3262SPaul Mullowney   PetscScalar                       *AAUp;
751087f3262SPaul Mullowney   PetscScalar                       *AALo;
752087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
753087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
754087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
755087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
756087f3262SPaul Mullowney 
757087f3262SPaul Mullowney   PetscFunctionBegin;
758cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
759c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
760087f3262SPaul Mullowney     try {
7615f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar)));
7625f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar)));
763da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
764087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
7655f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt)));
7665f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt)));
767087f3262SPaul Mullowney 
768087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
769087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
770087f3262SPaul Mullowney         AiUp[n]=nzUpper;
771087f3262SPaul Mullowney         offset = 0;
772087f3262SPaul Mullowney         for (i=0; i<n; i++) {
773087f3262SPaul Mullowney           /* set the pointers */
774087f3262SPaul Mullowney           v  = aa + ai[i];
775087f3262SPaul Mullowney           vj = aj + ai[i];
776087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
777087f3262SPaul Mullowney 
778087f3262SPaul Mullowney           /* first, set the diagonal elements */
779087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
78009f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
781087f3262SPaul Mullowney           AiUp[i]      = offset;
78209f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
783087f3262SPaul Mullowney 
784087f3262SPaul Mullowney           offset+=1;
785087f3262SPaul Mullowney           if (nz>0) {
7865f80ce2aSJacob Faibussowitsch             CHKERRQ(PetscArraycpy(&(AjUp[offset]), vj, nz));
7875f80ce2aSJacob Faibussowitsch             CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
788087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
789087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
790087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
791087f3262SPaul Mullowney             }
792087f3262SPaul Mullowney             offset+=nz;
793087f3262SPaul Mullowney           }
794087f3262SPaul Mullowney         }
795087f3262SPaul Mullowney 
796aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
7975f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscNew(&upTriFactor));
798da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
799087f3262SPaul Mullowney 
800aa372e3fSPaul Mullowney         /* Create the matrix description */
8015f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr));
8025f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8031b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8045f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
805afb2bd1cSJunchao Zhang        #else
8065f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
807afb2bd1cSJunchao Zhang        #endif
8085f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8095f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT));
810087f3262SPaul Mullowney 
811aa372e3fSPaul Mullowney         /* set the matrix */
812aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
813aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
814aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
815aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
816aa372e3fSPaul Mullowney 
817aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
818aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
819aa372e3fSPaul Mullowney 
820aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
821aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
824aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
825aa372e3fSPaul Mullowney 
826afb2bd1cSJunchao Zhang         /* set the operation */
827afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
828afb2bd1cSJunchao Zhang 
829afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8305f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
8315f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparse_create_analysis_info(&upTriFactor->solveInfo));
8321b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8335f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
834afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
835afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
836afb2bd1cSJunchao Zhang                                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
8375f80ce2aSJacob Faibussowitsch                                                &upTriFactor->solveBufferSize));
8385f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize));
839afb2bd1cSJunchao Zhang       #endif
840afb2bd1cSJunchao Zhang 
841aa372e3fSPaul Mullowney         /* perform the solve analysis */
8425f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
843aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
844aa372e3fSPaul Mullowney                                          upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
845d49cd2b7SBarry Smith                                          upTriFactor->csrMat->column_indices->data().get(),
8461b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
847d49cd2b7SBarry Smith                                          upTriFactor->solveInfo,
8485f80ce2aSJacob Faibussowitsch                                          upTriFactor->solvePolicy, upTriFactor->solveBuffer));
849d49cd2b7SBarry Smith                                          #else
8505f80ce2aSJacob Faibussowitsch                                          upTriFactor->solveInfo));
851afb2bd1cSJunchao Zhang                                          #endif
8525f80ce2aSJacob Faibussowitsch         CHKERRCUDA(WaitForCUDA());
8535f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
854aa372e3fSPaul Mullowney 
855da79fbbcSStefano Zampini         /* assign the pointer */
856aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
857aa372e3fSPaul Mullowney 
858aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
8595f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscNew(&loTriFactor));
860da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
861aa372e3fSPaul Mullowney 
862aa372e3fSPaul Mullowney         /* Create the matrix description */
8635f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr));
8645f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO));
8651b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8665f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
867afb2bd1cSJunchao Zhang        #else
8685f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR));
869afb2bd1cSJunchao Zhang        #endif
8705f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER));
8715f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT));
872aa372e3fSPaul Mullowney 
873aa372e3fSPaul Mullowney         /* set the operation */
874aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
875aa372e3fSPaul Mullowney 
876aa372e3fSPaul Mullowney         /* set the matrix */
877aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
878aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
879aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
880aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
881aa372e3fSPaul Mullowney 
882aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
883aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
884aa372e3fSPaul Mullowney 
885aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
886aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
887aa372e3fSPaul Mullowney 
888aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
889aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
890aa372e3fSPaul Mullowney 
891afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
8925f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
8935f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparse_create_analysis_info(&loTriFactor->solveInfo));
8941b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
8955f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
896afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
897afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
898afb2bd1cSJunchao Zhang                                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
8995f80ce2aSJacob Faibussowitsch                                                &loTriFactor->solveBufferSize));
9005f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize));
901afb2bd1cSJunchao Zhang       #endif
902afb2bd1cSJunchao Zhang 
903aa372e3fSPaul Mullowney         /* perform the solve analysis */
9045f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
905aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
906aa372e3fSPaul Mullowney                                          loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
907d49cd2b7SBarry Smith                                          loTriFactor->csrMat->column_indices->data().get(),
9081b0a6780SStefano Zampini                                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
909d49cd2b7SBarry Smith                                          loTriFactor->solveInfo,
9105f80ce2aSJacob Faibussowitsch                                          loTriFactor->solvePolicy, loTriFactor->solveBuffer));
911d49cd2b7SBarry Smith                                          #else
9125f80ce2aSJacob Faibussowitsch                                          loTriFactor->solveInfo));
913afb2bd1cSJunchao Zhang                                          #endif
9145f80ce2aSJacob Faibussowitsch         CHKERRCUDA(WaitForCUDA());
9155f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));
916aa372e3fSPaul Mullowney 
917da79fbbcSStefano Zampini         /* assign the pointer */
918aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
919087f3262SPaul Mullowney 
9205f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar))));
9215f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaFreeHost(AiUp));
9225f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaFreeHost(AjUp));
923da79fbbcSStefano Zampini       } else {
924da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
925da79fbbcSStefano Zampini         offset = 0;
926da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
927da79fbbcSStefano Zampini           /* set the pointers */
928da79fbbcSStefano Zampini           v  = aa + ai[i];
929da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
930da79fbbcSStefano Zampini 
931da79fbbcSStefano Zampini           /* first, set the diagonal elements */
932da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
933da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
934da79fbbcSStefano Zampini 
935da79fbbcSStefano Zampini           offset+=1;
936da79fbbcSStefano Zampini           if (nz>0) {
9375f80ce2aSJacob Faibussowitsch             CHKERRQ(PetscArraycpy(&(AAUp[offset]), v, nz));
938da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
939da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
940da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
941da79fbbcSStefano Zampini             }
942da79fbbcSStefano Zampini             offset+=nz;
943da79fbbcSStefano Zampini           }
944da79fbbcSStefano Zampini         }
945*28b400f6SJacob Faibussowitsch         PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
946*28b400f6SJacob Faibussowitsch         PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
947da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
948da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
9495f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar)));
950da79fbbcSStefano Zampini       }
9515f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaFreeHost(AAUp));
9525f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaFreeHost(AALo));
953087f3262SPaul Mullowney     } catch(char *ex) {
95498921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
955087f3262SPaul Mullowney     }
956087f3262SPaul Mullowney   }
957087f3262SPaul Mullowney   PetscFunctionReturn(0);
958087f3262SPaul Mullowney }
959087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors of A on the GPU and
  uploads the ordering used by the factorization.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors and A->data a Mat_SeqAIJ

  What it does:
  - calls MatSeqAIJCUSPARSEBuildICCTriMatrices() to create/refresh the triangular factors;
  - creates the work vector of length n the first time it is needed;
  - records nnz = 2*(a->nz - n) + n in the factor container;
  - marks A as valid on both host and device (PETSC_OFFLOAD_BOTH);
  - if the ordering a->row is not the identity, copies it into rpermIndices and its inverse into
    cpermIndices.

  Notes:
  The permutation arrays are allocated only when they do not exist yet and are otherwise refilled in
  place. This routine runs on every numeric factorization, so allocating them unconditionally would
  leak the previous device arrays each time the factor is recomputed. This relies on the pointers being
  NULL whenever no array is owned, the same convention the workVector guard below depends on.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  CHKERRQ(MatSeqAIJCUSPARSEBuildICCTriMatrices(A));
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices: only needed when the ordering is not the natural one */
  CHKERRQ(ISIdentity(ip,&perm_identity));
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    CHKERRQ(ISInvertPermutation(ip,PETSC_DECIDE,&iip));
    CHKERRQ(ISGetIndices(iip,&irip));
    CHKERRQ(ISGetIndices(ip,&rip));
    /* reuse the device arrays from a previous factorization instead of leaking them */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    CHKERRQ(ISRestoreIndices(iip,&irip));
    CHKERRQ(ISDestroy(&iip));
    CHKERRQ(ISRestoreIndices(ip,&rip));
    /* two index arrays of length n were sent to the device */
    CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  }
  PetscFunctionReturn(0);
}
996087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SEQAIJCUSPARSE.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded unchanged to MatCholeskyFactorNumeric_SeqAIJ()

  The factorization itself is delegated to MatCholeskyFactorNumeric_SeqAIJ() after A has been copied
  back from the GPU. Afterwards the solve callbacks of B are chosen according to whether the ordering
  stored in the factor is the identity, and the triangular factors are analyzed and copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ *fact    = (Mat_SeqAIJ*)B->data;
  IS         ordering = fact->row;
  PetscBool  natural;

  PetscFunctionBegin;
  /* factor with the SeqAIJ kernel; the result is valid on the CPU only */
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  CHKERRQ(MatCholeskyFactorNumeric_SeqAIJ(B,A,info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* no multi-vector solves, whatever the ordering is */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* pick the MatSolve variants matching the ordering */
  CHKERRQ(ISIdentity(ordering,&natural));
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build the triangular factors on the GPU */
  CHKERRQ(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(0);
}
10259ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private - Creates the explicit transpose of one triangular
  factor and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used only as the object attached to the log events
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the triangular factor to transpose (CSR storage on the device)

  Output Parameter:
. triFactorTOut - newly allocated factor holding the transpose

  Notes:
  The new factor copies the index base, matrix type and diagonal type of triFactor's descriptor, uses
  the opposite fill mode, and is solved with CUSPARSE_OPERATION_NON_TRANSPOSE.

  With CUDA >= 11 the csr2csc work buffer is allocated here and stored in the *source* factor
  (triFactor->csr2cscBuffer), not in the transposed one.

  If an error occurs part-way, the partially built factor is not released here.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  CsrMatrix                         *csr = triFactor->csrMat,*csrT;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;

  PetscFunctionBegin;
  /* allocate space for the transposed triangular factor */
  CHKERRQ(PetscNew(&triFactorT));
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the source factor; transposing swaps upper <-> lower fill */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  CHKERRCUSPARSE(cusparseCreateMatDescr(&triFactorT->descr));
  CHKERRCUSPARSE(cusparseSetMatIndexBase(triFactorT->descr, indexBase));
  CHKERRCUSPARSE(cusparseSetMatType(triFactorT->descr, matrixType));
  CHKERRCUSPARSE(cusparseSetMatFillMode(triFactorT->descr, fillMode));
  CHKERRCUSPARSE(cusparseSetMatDiagType(triFactorT->descr, diagType));

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the source factor, i.e. the CSR of its transpose */
  triFactorT->csrMat   = new CsrMatrix;
  csrT                 = triFactorT->csrMat;
  csrT->num_rows       = csr->num_cols;
  csrT->num_cols       = csr->num_rows;
  csrT->num_entries    = csr->num_entries;
  csrT->row_offsets    = new THRUSTINTARRAY32(csrT->num_rows+1);
  csrT->column_indices = new THRUSTINTARRAY32(csrT->num_entries);
  csrT->values         = new THRUSTARRAY(csrT->num_entries);

  /* compute the transpose of the source factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  CHKERRCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, csr->num_rows,
                                               csr->num_cols, csr->num_entries,
                                               csr->values->data().get(),
                                               csr->row_offsets->data().get(),
                                               csr->column_indices->data().get(),
                                               csrT->values->data().get(),
                                               csrT->row_offsets->data().get(), csrT->column_indices->data().get(), cusparse_scalartype,
                                               CUSPARSE_ACTION_NUMERIC,indexBase,
                                               CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize));
  CHKERRCUDA(cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize));
#endif

  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, csr->num_rows,
                                  csr->num_cols, csr->num_entries,
                                  csr->values->data().get(),
                                  csr->row_offsets->data().get(),
                                  csr->column_indices->data().get(),
                                  csrT->values->data().get(),
                                  csrT->row_offsets->data().get(), csrT->column_indices->data().get(), cusparse_scalartype,
                                  CUSPARSE_ACTION_NUMERIC, indexBase,
                                  CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer));
#else
  /* the legacy interface takes the output index arrays in the opposite order */
  CHKERRCUSPARSE(cusparse_csr2csc(cusparseTriFactors->handle, csr->num_rows,
                                  csr->num_cols, csr->num_entries,
                                  csr->values->data().get(),
                                  csr->row_offsets->data().get(),
                                  csr->column_indices->data().get(),
                                  csrT->values->data().get(),
                                  csrT->column_indices->data().get(), csrT->row_offsets->data().get(),
                                  CUSPARSE_ACTION_NUMERIC, indexBase));
#endif
  CHKERRCUDA(WaitForCUDA());
  /* close the event opened above; a second Begin here would leave it unbalanced */
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));

  /* Create the solve analysis information */
  CHKERRQ(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0));
  CHKERRCUSPARSE(cusparse_create_analysis_info(&triFactorT->solveInfo));
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  CHKERRCUSPARSE(cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                         csrT->num_rows, csrT->num_entries, triFactorT->descr,
                                         csrT->values->data().get(), csrT->row_offsets->data().get(),
                                         csrT->column_indices->data().get(), triFactorT->solveInfo,
                                         &triFactorT->solveBufferSize));
  CHKERRCUDA(cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize));
#endif

  /* perform the solve analysis */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                                   csrT->num_rows, csrT->num_entries, triFactorT->descr,
                                   csrT->values->data().get(), csrT->row_offsets->data().get(),
                                   csrT->column_indices->data().get(),
                                   triFactorT->solveInfo,
                                   triFactorT->solvePolicy, triFactorT->solveBuffer));
#else
  CHKERRCUSPARSE(cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                                   csrT->num_rows, csrT->num_entries, triFactorT->descr,
                                   csrT->values->data().get(), csrT->row_offsets->data().get(),
                                   csrT->column_indices->data().get(),
                                   triFactorT->solveInfo));
#endif
  CHKERRCUDA(WaitForCUDA());
  CHKERRQ(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0));

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds the explicit transposes of the lower and upper
  triangular factors of A and stores them in the factor container for use by transpose solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose loTriFactorPtr and
      upTriFactorPtr have already been set

  On return loTriFactorPtrTranspose and upTriFactorPtrTranspose point to the new factors. The lower
  transpose is stored before the upper one is built, so it stays owned by the container even if
  building the upper transpose fails.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;

  PetscFunctionBegin;
  PetscCheck(cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  PetscCheck(loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  PetscCheck(upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  CHKERRQ(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT));
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  CHKERRQ(MatSeqAIJCUSPARSEBuildTransposedTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT));
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1229bda325fcSPaul Mullowney 
/*
  PetscScalarToPetscInt - Unary functor, callable on host and device, that converts a PetscScalar to a
  PetscInt by taking its real part and casting it.
*/
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    /* PetscRealPart() drops any imaginary component before the cast */
    const PetscInt asInt = static_cast<PetscInt>(PetscRealPart(s));
    return asInt;
  }
};
1238a49f1ed0SStefano Zampini 
12393606e59fSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
1240bda325fcSPaul Mullowney {
1241aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1242a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1243bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1244bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1245aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1246b175d8bbSPaul Mullowney 
1247bda325fcSPaul Mullowney   PetscFunctionBegin;
12485f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
1249a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1250*28b400f6SJacob Faibussowitsch   PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1251a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
12522c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
12531a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
12545f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
12555f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1256a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
12575f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
1258a49f1ed0SStefano Zampini   }
1259a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1260aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
12615f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
1262aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
12635f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
12645f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
1265aa372e3fSPaul Mullowney 
1266b06137fdSPaul Mullowney     /* set alpha and beta */
12675f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar)));
12685f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar)));
12695f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar)));
12705f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
12715f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
12725f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
1273b06137fdSPaul Mullowney 
1274aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1275aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1276a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1277554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1278554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1279aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1280a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1281aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1282aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1283a3fdcf43SKarl Rupp 
1284039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
128581902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1286afb2bd1cSJunchao Zhang 
1287afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
12883606e59fSJunchao Zhang       #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
1289afb2bd1cSJunchao Zhang         stat = cusparseCreateCsr(&matstructT->matDescr,
1290afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1291afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1292afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1293afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1294afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
12953606e59fSJunchao Zhang       #else
12963606e59fSJunchao Zhang         /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
12973606e59fSJunchao Zhang            see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1
12983606e59fSJunchao Zhang 
12993606e59fSJunchao Zhang            I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
13003606e59fSJunchao Zhang            it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
13013606e59fSJunchao Zhang            when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
13023606e59fSJunchao Zhang         */
13033606e59fSJunchao Zhang         if (matrixT->num_entries) {
13043606e59fSJunchao Zhang           stat = cusparseCreateCsr(&matstructT->matDescr,
13053606e59fSJunchao Zhang                                  matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
13063606e59fSJunchao Zhang                                  matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
13073606e59fSJunchao Zhang                                  matrixT->values->data().get(),
13083606e59fSJunchao Zhang                                  CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
13093606e59fSJunchao Zhang                                  indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
13103606e59fSJunchao Zhang 
13113606e59fSJunchao Zhang         } else {
13123606e59fSJunchao Zhang           matstructT->matDescr = NULL;
13133606e59fSJunchao Zhang           matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
13143606e59fSJunchao Zhang         }
13153606e59fSJunchao Zhang       #endif
1316afb2bd1cSJunchao Zhang      #endif
1317aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1318afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1319afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1320afb2bd1cSJunchao Zhang    #else
1321aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
132251c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
132351c6d536SStefano Zampini       /* First convert HYB to CSR */
1324aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1325aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1326aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1327aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1328aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1329aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1330aa372e3fSPaul Mullowney 
1331aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1332aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1333aa372e3fSPaul Mullowney                               temp->values->data().get(),
1334aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
133557d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1336aa372e3fSPaul Mullowney 
1337aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1338aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1339aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1340aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1341aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1342aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1343aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1344aa372e3fSPaul Mullowney 
1345aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1346aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1347aa372e3fSPaul Mullowney                               temp->values->data().get(),
1348aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1349aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1350aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1351aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1352aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
135357d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1354aa372e3fSPaul Mullowney 
1355aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1356aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
13575f80ce2aSJacob Faibussowitsch       CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat));
1358aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1359aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1360aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1361aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1362aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1363aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
136457d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1365aa372e3fSPaul Mullowney 
1366aa372e3fSPaul Mullowney       /* assign the pointer */
1367aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13681a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1369aa372e3fSPaul Mullowney       /* delete temporaries */
1370aa372e3fSPaul Mullowney       if (tempT) {
1371aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1372aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1373aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1374aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1375087f3262SPaul Mullowney       }
1376aa372e3fSPaul Mullowney       if (temp) {
1377aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1378aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1379aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1380aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1381aa372e3fSPaul Mullowney       }
1382afb2bd1cSJunchao Zhang      #endif
1383aa372e3fSPaul Mullowney     }
1384a49f1ed0SStefano Zampini   }
1385a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1386a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1387a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1388*28b400f6SJacob Faibussowitsch     PetscCheck(matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1389*28b400f6SJacob Faibussowitsch     PetscCheck(matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1390*28b400f6SJacob Faibussowitsch     PetscCheck(matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1391*28b400f6SJacob Faibussowitsch     PetscCheck(matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1392*28b400f6SJacob Faibussowitsch     PetscCheck(matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1393*28b400f6SJacob Faibussowitsch     PetscCheck(matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1394*28b400f6SJacob Faibussowitsch     PetscCheck(matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1395*28b400f6SJacob Faibussowitsch     PetscCheck(matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1396a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1397a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1398a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
13995f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
1400a49f1ed0SStefano Zampini     }
1401a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1402a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1403a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1404a49f1ed0SStefano Zampini 
1405a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1406a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1407a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1408a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1409a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1410a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1411a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1412a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1413a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1414a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1415a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1416a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1417a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
14185f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMalloc(&csr2cscBuffer,csr2cscBufferSize));
1419a49f1ed0SStefano Zampini      #endif
1420a49f1ed0SStefano Zampini 
14211a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
14221a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
14231a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
14241a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
14251a2c6b5cSJunchao Zhang 
14261a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
14271a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
14281a2c6b5cSJunchao Zhang         */
14291a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
14301a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
14311a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
14321a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
14331a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1434a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1435a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1436a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1437a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
14381a2c6b5cSJunchao Zhang                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1439a49f1ed0SStefano Zampini                              #else
1440a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
14411a2c6b5cSJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1442a49f1ed0SStefano Zampini                              #endif
14431a2c6b5cSJunchao Zhang       } else {
14441a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
14451a2c6b5cSJunchao Zhang       }
14461a2c6b5cSJunchao Zhang 
1447a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1448a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1449a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
14505f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaFree(csr2cscBuffer));
1451a49f1ed0SStefano Zampini      #endif
1452a49f1ed0SStefano Zampini     }
1453a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1454a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1455a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1456a49f1ed0SStefano Zampini   }
14575f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
14585f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0));
1459213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1460213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1461aa372e3fSPaul Mullowney   /* assign the pointer */
1462aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
14631a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1464bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1465bda325fcSPaul Mullowney }
1466bda325fcSPaul Mullowney 
1467a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
14686fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1469bda325fcSPaul Mullowney {
1470c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1471465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1472465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1473465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1474465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1475bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1476bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1477aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1478aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1479aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1480bda325fcSPaul Mullowney 
1481bda325fcSPaul Mullowney   PetscFunctionBegin;
1482aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1483aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
14845f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1485aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1486aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1487bda325fcSPaul Mullowney   }
1488bda325fcSPaul Mullowney 
1489bda325fcSPaul Mullowney   /* Get the GPU pointers */
14905f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
14915f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
1492c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1493c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1494bda325fcSPaul Mullowney 
14955f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1496aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1497a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1498c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1499c41cb2e2SAlejandro Lamas Daviña                xGPU);
1500aa372e3fSPaul Mullowney 
1501aa372e3fSPaul Mullowney   /* First, solve U */
1502aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1503afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
15041b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1505afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1506afb2bd1cSJunchao Zhang                       #endif
1507afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1508aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1509aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1510aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1511aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1512d49cd2b7SBarry Smith                         xarray,
15131b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1514d49cd2b7SBarry Smith                         tempGPU->data().get(),
1515d49cd2b7SBarry Smith                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1516d49cd2b7SBarry Smith                       #else
1517d49cd2b7SBarry Smith                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1518afb2bd1cSJunchao Zhang                       #endif
1519aa372e3fSPaul Mullowney 
1520aa372e3fSPaul Mullowney   /* Then, solve L */
1521aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1522afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
15231b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1524afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1525afb2bd1cSJunchao Zhang                       #endif
1526afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1527aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1528aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1529aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1530aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1531d49cd2b7SBarry Smith                         tempGPU->data().get(),
15321b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1533d49cd2b7SBarry Smith                         xarray,
1534d49cd2b7SBarry Smith                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1535d49cd2b7SBarry Smith                       #else
1536d49cd2b7SBarry Smith                          xarray);CHKERRCUSPARSE(stat);
1537afb2bd1cSJunchao Zhang                       #endif
1538aa372e3fSPaul Mullowney 
1539aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1540a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1541c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1542aa372e3fSPaul Mullowney                tempGPU->begin());
1543aa372e3fSPaul Mullowney 
1544aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1545a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1546bda325fcSPaul Mullowney 
1547bda325fcSPaul Mullowney   /* restore */
15485f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
15495f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
15505f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
15515f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1552bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1553bda325fcSPaul Mullowney }
1554bda325fcSPaul Mullowney 
15556fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1556bda325fcSPaul Mullowney {
1557465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1558465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1559bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1560bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1561aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1562aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1563aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1564bda325fcSPaul Mullowney 
1565bda325fcSPaul Mullowney   PetscFunctionBegin;
1566aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1567aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
15685f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
1569aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1570aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1571bda325fcSPaul Mullowney   }
1572bda325fcSPaul Mullowney 
1573bda325fcSPaul Mullowney   /* Get the GPU pointers */
15745f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDAGetArrayWrite(xx,&xarray));
15755f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDAGetArrayRead(bb,&barray));
1576bda325fcSPaul Mullowney 
15775f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1578aa372e3fSPaul Mullowney   /* First, solve U */
1579aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1580afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
15811b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1582afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1583afb2bd1cSJunchao Zhang                       #endif
1584afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1585aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1586aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1587aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1588aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1589d49cd2b7SBarry Smith                         barray,
15901b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1591d49cd2b7SBarry Smith                         tempGPU->data().get(),
1592d49cd2b7SBarry Smith                         upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1593d49cd2b7SBarry Smith                       #else
1594d49cd2b7SBarry Smith                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1595afb2bd1cSJunchao Zhang                       #endif
1596aa372e3fSPaul Mullowney 
1597aa372e3fSPaul Mullowney   /* Then, solve L */
1598aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1599afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
16001b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1601afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1602afb2bd1cSJunchao Zhang                       #endif
1603afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1604aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1605aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1606aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1607aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1608d49cd2b7SBarry Smith                         tempGPU->data().get(),
16091b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1610d49cd2b7SBarry Smith                         xarray,
1611d49cd2b7SBarry Smith                         loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
1612d49cd2b7SBarry Smith                       #else
1613d49cd2b7SBarry Smith                         xarray);CHKERRCUSPARSE(stat);
1614afb2bd1cSJunchao Zhang                       #endif
1615bda325fcSPaul Mullowney 
1616bda325fcSPaul Mullowney   /* restore */
16175f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDARestoreArrayRead(bb,&barray));
16185f80ce2aSJacob Faibussowitsch   CHKERRQ(VecCUDARestoreArrayWrite(xx,&xarray));
16195f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
16205f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n));
1621bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1622bda325fcSPaul Mullowney }
1623bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Solves with the lower and upper triangular factors stored in
  A->spptr, reading the right-hand side from bb and writing the result to xx.

  Steps, all on the device:
    1. gather bb through rpermIndices into the work vector,
    2. triangular solve with the lower factor (work vector -> xx array),
    3. triangular solve with the upper factor (xx array -> work vector),
    4. gather the work vector through cpermIndices into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                     *rhs;
  PetscScalar                           *sol;
  thrust::device_ptr<const PetscScalar> rhsDev;
  thrust::device_ptr<PetscScalar>       solDev;
  cusparseStatus_t                      status;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors, wrapped for use with thrust */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&sol));
  CHKERRQ(VecCUDAGetArrayRead(bb,&rhs));
  solDev = thrust::device_pointer_cast(sol);
  rhsDev = thrust::device_pointer_cast(rhs);

  CHKERRQ(PetscLogGpuTimeBegin());

  /* Step 1: work = bb reordered with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: solve with L, from the work vector into the xx array */
  status = cusparse_solve(factors->handle, lower->solveOp,
                          lower->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lower->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lower->descr,
                          lower->csrMat->values->data().get(),
                          lower->csrMat->row_offsets->data().get(),
                          lower->csrMat->column_indices->data().get(),
                          lower->solveInfo,
                          work->data().get(),
                          sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lower->solvePolicy, lower->solveBuffer
                        #endif
                          );
  CHKERRCUSPARSE(status);

  /* Step 3: solve with U, from the xx array back into the work vector */
  status = cusparse_solve(factors->handle, upper->solveOp,
                          upper->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upper->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upper->descr,
                          upper->csrMat->values->data().get(),
                          upper->csrMat->row_offsets->data().get(),
                          upper->csrMat->column_indices->data().get(),
                          upper->solveInfo,
                          sol,
                          work->data().get()
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upper->solvePolicy, upper->solveBuffer
                        #endif
                          );
  CHKERRCUSPARSE(status);

  /* Step 4: xx = work reordered with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solDev);

  /* Hand the arrays back, then close the timing region and log the flop estimate */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&rhs));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&sol));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
16989ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Same two triangular solves as the permuted
  variant, but without any reordering of the vectors: the lower solve reads bb directly
  and writes the work vector, and the upper solve reads the work vector and writes xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  status;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors */
  CHKERRQ(VecCUDAGetArrayWrite(xx,&sol));
  CHKERRQ(VecCUDAGetArrayRead(bb,&rhs));

  CHKERRQ(PetscLogGpuTimeBegin());

  /* Solve with L: bb array -> work vector */
  status = cusparse_solve(factors->handle, lower->solveOp,
                          lower->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lower->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lower->descr,
                          lower->csrMat->values->data().get(),
                          lower->csrMat->row_offsets->data().get(),
                          lower->csrMat->column_indices->data().get(),
                          lower->solveInfo,
                          rhs,
                          work->data().get()
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lower->solvePolicy, lower->solveBuffer
                        #endif
                          );
  CHKERRCUSPARSE(status);

  /* Solve with U: work vector -> xx array */
  status = cusparse_solve(factors->handle, upper->solveOp,
                          upper->csrMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upper->csrMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upper->descr,
                          upper->csrMat->values->data().get(),
                          upper->csrMat->row_offsets->data().get(),
                          upper->csrMat->column_indices->data().get(),
                          upper->solveInfo,
                          work->data().get(),
                          sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upper->solvePolicy, upper->solveBuffer
                        #endif
                          );
  CHKERRCUSPARSE(status);

  /* Hand the arrays back, then close the timing region and log the flop estimate */
  CHKERRQ(VecCUDARestoreArrayRead(bb,&rhs));
  CHKERRQ(VecCUDARestoreArrayWrite(xx,&sol));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n));
  PetscFunctionReturn(0);
}
17599ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the nonzero values of A from the device CSR
  storage into the host array a->a.

  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_GPU; after a copy the mask is
  set to PETSC_OFFLOAD_BOTH. Only the values (a->nz scalars) are transferred.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cuspData = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* Any other mask value means there is nothing to bring back */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);
  {
    CsrMatrix *csr = (CsrMatrix*)cuspData->mat->mat;

    CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    CHKERRCUDA(cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar)));
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17787e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array of A in *array,
  after first calling MatSeqAIJCUSPARSECopyFromGPU() on A.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
178667a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - Ends read/write access to the host values:
  clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
179467a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - Returns the host values array of A in *array
  for read-only use, after first calling MatSeqAIJCUSPARSECopyFromGPU() on A.
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ *aij;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
180267a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - Ends read-only access to the host values.
  Only the caller's pointer is cleared; the offload mask of A is left as it is.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
180967a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - Returns the host values array of A in *array
  for writing. No device-to-host copy is made here, unlike the read and read/write getters.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
181667a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - Ends write access to the host values:
  clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
18247e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the device representation of A up to date with the host data.

  Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU:
    - if the nonzero state recorded on the device side matches A->nonzerostate and the storage
      format is CSR, only the values are re-sent;
    - otherwise the device structure is destroyed and rebuilt from the (possibly compressed-row)
      host CSR data, in CSR or, for CUDA versions before 11.0, ELL/HYB format.

  The mask becomes PETSC_OFFLOAD_BOTH unless the host has no values array (a->a is NULL), in
  which case only the sparsity pattern is sent and the mask is left unchanged.

  Errors: raised if A is bound to the CPU or if required host CSR arrays are missing.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult = cusp->mat;
  Mat_SeqAIJ                   *aij  = (Mat_SeqAIJ*)A->data;
  PetscInt                     nrows = A->rmap->n,*rowptr,*rowidx,ncprow;
  cusparseStatus_t             status;
  PetscBool                    valuesSent = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  /* Nothing to do unless the mask says the device side is missing or stale */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  if (A->nonzerostate == cusp->nonzerostate && cusp->format == MAT_CUSPARSE_CSR) {
    /* Same nonzero state as last time: refresh the values only */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    PetscCheck(!aij->nz || aij->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
    CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
    csr->values->assign(aij->a, aij->a+aij->nz);
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogCpuToGpu((aij->nz)*sizeof(PetscScalar)));
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  } else {
    /* Rebuild the whole device structure */
    PetscInt nnz;

    CHKERRQ(PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
    delete cusp->workVector;
    cusp->workVector = NULL;
    delete cusp->rowoffsets_gpu;
    cusp->rowoffsets_gpu = NULL;
    try {
      /* Choose between the compressed-row and the full row description */
      if (aij->compressedrow.use) {
        nrows  = aij->compressedrow.nrows;
        rowptr = aij->compressedrow.i;
        rowidx = aij->compressedrow.rindex;
      } else {
        nrows  = A->rmap->n;
        rowptr = aij->i;
        rowidx = NULL;
      }
      PetscCheck(rowptr,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
      if (aij->a) nnz = aij->nz;
      else { /* no host values: take the count from the row offsets and do not claim the values were sent */
        nnz        = rowptr[nrows];
        valuesSent = PETSC_FALSE;
      }
      PetscCheck(!nnz || aij->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");

      /* create cusparse matrix */
      cusp->nrows = nrows;
      mult = new Mat_SeqAIJCUSPARSEMultStruct;
      CHKERRCUSPARSE(cusparseCreateMatDescr(&mult->descr));
      CHKERRCUSPARSE(cusparseSetMatIndexBase(mult->descr, CUSPARSE_INDEX_BASE_ZERO));
      CHKERRCUSPARSE(cusparseSetMatType(mult->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

      /* Device-resident scalar constants; the handle is switched to device pointer mode */
      CHKERRCUDA(cudaMalloc((void **)&(mult->alpha_one),sizeof(PetscScalar)));
      CHKERRCUDA(cudaMalloc((void **)&(mult->beta_zero),sizeof(PetscScalar)));
      CHKERRCUDA(cudaMalloc((void **)&(mult->beta_one), sizeof(PetscScalar)));
      CHKERRCUDA(cudaMemcpy(mult->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
      CHKERRCUDA(cudaMemcpy(mult->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
      CHKERRCUDA(cudaMemcpy(mult->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
      CHKERRCUSPARSE(cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE));

      if (cusp->format == MAT_CUSPARSE_CSR) {
        /* CSR storage: upload row offsets, column indices and (if present) values */
        CsrMatrix *csr = new CsrMatrix;

        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
        csr->row_offsets->assign(rowptr, rowptr + nrows+1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j+nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

        /* assign the pointer */
        mult->mat = csr;
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        if (csr->num_rows) { /* cusparse errors on empty matrices! */
          status = cusparseCreateCsr(&mult->matDescr,
                                     csr->num_rows, csr->num_cols, csr->num_entries,
                                     csr->row_offsets->data().get(), csr->column_indices->data().get(),
                                     csr->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(status);
        }
       #endif
      } else if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        /* ELL/HYB storage: stage a temporary CSR copy on the device and convert it */
        CsrMatrix *csr = new CsrMatrix;

        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
        csr->row_offsets->assign(rowptr, rowptr + nrows+1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j+nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

        cusparseHybMat_t hybMat;
        CHKERRCUSPARSE(cusparseCreateHybMat(&hybMat));
        cusparseHybPartition_t partition = (cusp->format == MAT_CUSPARSE_ELL) ? CUSPARSE_HYB_PARTITION_MAX
                                                                              : CUSPARSE_HYB_PARTITION_AUTO;
        status = cusparse_csr2hyb(cusp->handle, csr->num_rows, csr->num_cols,
                                  mult->descr, csr->values->data().get(),
                                  csr->row_offsets->data().get(),
                                  csr->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(status);
        /* assign the pointer */
        mult->mat = hybMat;

        /* the staging CSR copy is no longer needed (delete on a null pointer is a no-op) */
        delete (THRUSTARRAY*)csr->values;
        delete (THRUSTINTARRAY32*)csr->column_indices;
        delete (THRUSTINTARRAY32*)csr->row_offsets;
        delete (CsrMatrix*)csr;
       #endif
      }

      /* assign the compressed row indices */
      if (aij->compressedrow.use) {
        cusp->workVector   = new THRUSTARRAY(nrows);
        mult->cprowIndices = new THRUSTINTARRAY(nrows);
        mult->cprowIndices->assign(rowidx,rowidx+nrows);
        ncprow = nrows;
      } else {
        cusp->workVector   = NULL;
        mult->cprowIndices = NULL;
        ncprow = 0;
      }
      CHKERRQ(PetscLogCpuToGpu(((nrows+1)+(aij->nz))*sizeof(int)+ncprow*sizeof(PetscInt)+(3+(aij->nz))*sizeof(PetscScalar)));

      /* assign the pointer */
      cusp->mat = mult;
    } catch(char *ex) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0));
    cusp->nonzerostate = A->nonzerostate;
  }
  if (valuesSent) A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
19799ae82921SPaul Mullowney 
/* Functor on a 2-tuple: adds element 0 into element 1 (element 1 += element 0) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple pair)
  {
    thrust::get<1>(pair) = thrust::get<1>(pair) + thrust::get<0>(pair);
  }
};
1989aa372e3fSPaul Mullowney 
/* Functor on a 2-tuple: copies element 0 into element 1 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple pair)
  {
    thrust::get<1>(pair) = thrust::get<0>(pair);
  }
};
19997e8381f9SStefano Zampini 
/* Functor on a 2-tuple: copies element 1 into element 0 (the reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple pair)
  {
    thrust::get<0>(pair) = thrust::get<1>(pair);
  }
};
2009e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - Per-product data attached to a matrix product (read back from
  product->data by the numeric routines) and released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool             cisdense;      /* NOTE(review): presumably records whether C was a host dense matrix - confirm at the use site */
  PetscScalar           *Bt;           /* device buffer, released with cudaFree() on destroy */
  Mat                   X;             /* auxiliary matrix, released with MatDestroy() on destroy */
  PetscBool             reusesym;      /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;         /* released with delete on destroy */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor, destroyed with cusparseDestroyDnMat() */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor, destroyed with cusparseDestroyDnMat() */
  PetscInt              Blda,Clda;     /* Record leading dimensions of B and C here to detect changes */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;     /* device buffers available from CUDA 11.4 on, released with cudaFree() */
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;
  void                  *mmBuffer;     /* device buffer, released with cudaFree() */
  void                  *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
2034ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything owned by a MatMatCusparse object and
  then the object itself.

  data is the MatMatCusparse pointer passed as void*. Device buffers are released with
  cudaFree(), cuSPARSE descriptors with their matching destroy calls, the auxiliary
  matrix X with MatDestroy(), and finally the struct with PetscFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse *mm = (MatMatCusparse*)data;

  PetscFunctionBegin;
  CHKERRCUDA(cudaFree(mm->Bt));
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cuSPARSE descriptors are destroyed only when they are set */
  if (mm->matSpBDescr) {CHKERRCUSPARSE(cusparseDestroySpMat(mm->matSpBDescr));}
  if (mm->matBDescr)   {CHKERRCUSPARSE(cusparseDestroyDnMat(mm->matBDescr));}
  if (mm->matCDescr)   {CHKERRCUSPARSE(cusparseDestroyDnMat(mm->matCDescr));}
  if (mm->spgemmDesc)  {CHKERRCUSPARSE(cusparseSpGEMM_destroyDescr(mm->spgemmDesc));}
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4)  {CHKERRCUDA(cudaFree(mm->dBuffer4));}
  if (mm->dBuffer5)  {CHKERRCUDA(cudaFree(mm->dBuffer5));}
 #endif
  if (mm->mmBuffer)  {CHKERRCUDA(cudaFree(mm->mmBuffer));}
  if (mm->mmBuffer2) {CHKERRCUDA(cudaFree(mm->mmBuffer2));}
 #endif
  CHKERRQ(MatDestroy(&mm->X));
  CHKERRQ(PetscFree(data));
  PetscFunctionReturn(0);
}
2058ccdfe979SStefano Zampini 
2059ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2060ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of a product between the
  MATSEQAIJCUSPARSE matrix C->product->A and the dense matrix C->product->B.

  Input Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse created by the symbolic phase

  Notes:
  Handles MATPRODUCT_AB, MATPRODUCT_AtB, MATPRODUCT_ABt, MATPRODUCT_PtAP and MATPRODUCT_RARt.
  The sparse-dense kernel computes op(A)*op(B). For AB, AtB and ABt the result is written directly
  into C; for PtAP and RARt it is written into the intermediate mmdata->X and C is then obtained
  with MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private().

  If B is not of type MATSEQDENSECUDA it is converted in place before the computation and converted
  back to MATSEQDENSE before returning. If mmdata->cisdense is set, C is converted in place to
  MATSEQDENSE at the end.

  With CUDA >= 11 the cuSPARSE generic API (cusparseSpMM) is used and the dense descriptors and the
  work buffer are cached in mmdata; they are rebuilt when the leading dimension of B or of the
  output changes. With older CUDA, B is explicitly transposed into mmdata->Bt when needed since
  cusparseXcsrmm does not support a transposed B.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A      = product->A;
  B      = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* Currently CopyToGPU does not copy if the matrix is bound to the CPU.
     Raise an error here rather than silently computing with outdated GPU data. */
  PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  /* select the sparse operand (A or its explicit transpose), the operation applied to it,
     and the sizes m x n of the sparse-dense product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
      mat = cusp->matTranspose;
      opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheck(mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;

  /* if the user passed a CPU matrix, copy the data to the GPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda));
  if (!biscuda) CHKERRQ(MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B));
  CHKERRQ(MatDenseCUDAGetArrayRead(B,&barray));

  CHKERRQ(MatDenseGetLDA(B,&blda));
  /* PtAP and RARt write the sparse-dense product into the intermediate matrix X */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDAGetArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatDenseGetLDA(mmdata->X,&clda));
  } else {
    CHKERRQ(MatDenseCUDAGetArrayWrite(C,&carray));
    CHKERRQ(MatDenseGetLDA(C,&clda));
  }

  CHKERRQ(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;

    /* a dense descriptor built for a different LDA cannot be reused */
    if (mmdata->matBDescr && mmdata->Blda != blda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Blda = blda;
    } else { /* descriptor is kept: make sure it points to the current array of B */
      CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    }

    if (mmdata->matCDescr && mmdata->Clda != clda) {CHKERRCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      CHKERRCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL));
      mmdata->Clda = clda;
    } else { /* descriptor is kept: make sure it points to the current output array */
      CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    } else { /* descriptor is kept: make sure it points to the current values of the sparse matrix */
      CHKERRCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow (never shrink) the work buffer */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      CHKERRCUDA(cudaFree(mmdata->mmBuffer));
      mmdata->mmBuffer = NULL; /* do not leave a dangling pointer behind if the allocation below fails */
      CHKERRCUDA(cudaMalloc(&mmdata->mmBuffer,mmBufferSize));
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    CHKERRCUSPARSE(cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get()));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray));
    CHKERRCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray));
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt */
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(PetscLogGpuFlops(n*2.0*csrmat->num_entries));
  CHKERRQ(MatDenseCUDARestoreArrayRead(B,&barray));
  /* NOTE(review): the two PetscBool arguments of the dense-dense helper presumably request
     transposition of its first and second operand - confirm against its definition */
  if (product->type == MATPRODUCT_RARt) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE));
  } else if (product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(mmdata->X,&carray));
    CHKERRQ(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE));
  } else {
    CHKERRQ(MatDenseCUDARestoreArrayWrite(C,&carray));
  }
  /* give the caller back the matrix types it started with */
  if (mmdata->cisdense) {
    CHKERRQ(MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C));
  }
  if (!biscuda) {
    CHKERRQ(MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B));
  }
  PetscFunctionReturn(0);
}
2231ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase of a product between the
  MATSEQAIJCUSPARSE matrix C->product->A and the dense matrix C->product->B.

  Input Parameter:
. C - the product matrix; C->product->data must still be empty

  Notes:
  Sets the sizes of C according to the product type (AB, AtB, ABt, PtAP or RARt), sets its type to
  MATSEQDENSECUDA, and creates the MatMatCusparse product data. For PtAP and RARt an intermediate
  MATSEQDENSECUDA matrix mmdata->X is created to hold the sparse-dense product. Whether C was of
  type MATSEQDENSE on entry is recorded in mmdata->cisdense.

  On success C->ops->productnumeric is set to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA().
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(cusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes m x n of the product matrix C */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  CHKERRQ(MatSetSizes(C,m,n,m,n));
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense));
  CHKERRQ(MatSetType(C,MATSEQDENSECUDA));

  /* product data: attach it to the product right away, so that it is released through
     MatDestroy_MatMatCusparse() instead of being leaked if any of the calls below fails */
  CHKERRQ(PetscNew(&mmdata));
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    CHKERRCUDA(cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar)));
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    CHKERRQ(MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X));
    CHKERRQ(MatSetType(mmdata->X,MATSEQDENSECUDA));
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) { /* X holds A*B^t */
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n));
    } else { /* X holds A*B */
      CHKERRQ(MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n));
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2304ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of a sparse-sparse product
  between the MATSEQAIJCUSPARSE matrices C->product->A and C->product->B.

  Input Parameter:
. C - the product matrix (MATSEQAIJCUSPARSE); C->product->data must hold the MatMatCusparse
      created by the symbolic phase

  Notes:
  Handles MATPRODUCT_AB, MATPRODUCT_AtB and MATPRODUCT_ABt. AtB (resp. ABt) is treated as AB when
  A (resp. B) is flagged symmetric, which requires the symbolic phase to have done the same.

  Nothing is computed when mmdata->reusesym is set (the flag is cleared) or when C has no
  nonzeros; in every case C is marked as assembled before returning.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product    *product = C->product;
  Mat_SeqAIJ     *c       = (Mat_SeqAIJ*)C->data;
  MatMatCusparse *mmdata;
  PetscBool      flg;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  CHKERRQ(PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;

  if (mmdata->reusesym) {
    /* this happens when api_user is true, meaning that the matrix values have been already
       computed in the MatProductSymbolic phase: only sanity-check C */
    Mat_SeqAIJCUSPARSE           *Ccusp;
    Mat_SeqAIJCUSPARSEMultStruct *Cmat;

    mmdata->reusesym = PETSC_FALSE;
    Ccusp            = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    PetscCheck(Cmat->mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
  } else if (c->nz) {
    Mat                          A = product->A,B = product->B;
    Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
    Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
    CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
    cusparseStatus_t             stat;
    MatProductType               ptype;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparseSpMatDescr_t         BmatSpDescr;
#endif
    /* cuSPARSE spgemm doesn't support transpose yet */
    cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    cusparseOperation_t          opB = CUSPARSE_OPERATION_NON_TRANSPOSE;

    /* validate both operands */
    CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
    CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
    PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
    PetscCheck(!A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    PetscCheck(!B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheck(Acusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));

    /* a transposed symmetric operand reduces the product to a plain AB */
    ptype = product->type;
    if (ptype == MATPRODUCT_AtB && A->symmetric) {
      ptype = MATPRODUCT_AB;
      PetscCheck(product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
    }
    if (ptype == MATPRODUCT_ABt && B->symmetric) {
      ptype = MATPRODUCT_AB;
      PetscCheck(product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
    }

    /* pick the operand structures: the explicit transpose stands in for a transposed operand */
    if (ptype != MATPRODUCT_AB && ptype != MATPRODUCT_AtB && ptype != MATPRODUCT_ABt) {
      SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
    }
    Amat = (ptype == MATPRODUCT_AtB) ? Acusp->matTranspose : Acusp->mat;
    Bmat = (ptype == MATPRODUCT_ABt) ? Bcusp->matTranspose : Bcusp->mat;
    Cmat = Ccusp->mat;
    PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
    PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
    PetscCheck(Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
    Acsr = (CsrMatrix*)Amat->mat;
    Bcsr = (CsrMatrix*)Bmat->mat;
    if (mmdata->Bcsr) Bcsr = mmdata->Bcsr; /* B may be in compressed row storage */
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
    PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
    PetscCheck(Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");

    CHKERRQ(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* B may be in compressed row storage */
    if (mmdata->Bcsr) BmatSpDescr = mmdata->matSpBDescr;
    else              BmatSpDescr = Bmat->matDescr;
    CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                  Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                  cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                  mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
    stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                               Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                               Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                               Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    CHKERRQ(PetscLogGpuFlops(mmdata->flops));
    CHKERRCUDA(WaitForCUDA());
    CHKERRQ(PetscLogGpuTimeEnd());
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* shorter version of MatAssemblyEnd_SeqAIJ */
  CHKERRQ(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz));
  CHKERRQ(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n"));
  CHKERRQ(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax));
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2432fcdce8c4SStefano Zampini 
2433fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2434fcdce8c4SStefano Zampini {
2435fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2436fcdce8c4SStefano Zampini   Mat                          A,B;
2437fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2438fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2439fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2440fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2441fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2442fcdce8c4SStefano Zampini   PetscBool                    flg;
2443fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2444fcdce8c4SStefano Zampini   MatProductType               ptype;
2445fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2446fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2447fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2448fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2449fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2450fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2451fcdce8c4SStefano Zampini #else
2452fcdce8c4SStefano Zampini   int                          cnz;
2453fcdce8c4SStefano Zampini #endif
2454b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2455fcdce8c4SStefano Zampini 
2456fcdce8c4SStefano Zampini   PetscFunctionBegin;
2457fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2458*28b400f6SJacob Faibussowitsch   PetscCheck(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2459fcdce8c4SStefano Zampini   A    = product->A;
2460fcdce8c4SStefano Zampini   B    = product->B;
24615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg));
2462*28b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
24635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg));
2464*28b400f6SJacob Faibussowitsch   PetscCheck(flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2465fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2466fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2467fcdce8c4SStefano Zampini   /* product data */
24685f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscNew(&mmdata));
2469fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2470fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2471fcdce8c4SStefano Zampini 
24725f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
24735f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
2474d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2475d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
24762c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
24772c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2478d60bce21SJunchao Zhang 
2479fcdce8c4SStefano Zampini   ptype = product->type;
2480fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2481fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2482fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2483fa046f9fSJunchao Zhang   }
2484fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2485fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2486fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2487fa046f9fSJunchao Zhang   }
2488fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2489fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2490fcdce8c4SStefano Zampini   switch (ptype) {
2491fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2492fcdce8c4SStefano Zampini     m = A->rmap->n;
2493fcdce8c4SStefano Zampini     n = B->cmap->n;
2494fcdce8c4SStefano Zampini     k = A->cmap->n;
2495fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2496fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2497fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2498fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2499fcdce8c4SStefano Zampini     break;
2500fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2501fcdce8c4SStefano Zampini     m = A->cmap->n;
2502fcdce8c4SStefano Zampini     n = B->cmap->n;
2503fcdce8c4SStefano Zampini     k = A->rmap->n;
25045f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
2505fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2506fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2507fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2508fcdce8c4SStefano Zampini     break;
2509fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2510fcdce8c4SStefano Zampini     m = A->rmap->n;
2511fcdce8c4SStefano Zampini     n = B->rmap->n;
2512fcdce8c4SStefano Zampini     k = A->cmap->n;
25135f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
2514fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2515fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2516fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2517fcdce8c4SStefano Zampini     break;
2518fcdce8c4SStefano Zampini   default:
251998921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2520fcdce8c4SStefano Zampini   }
2521fcdce8c4SStefano Zampini 
2522fcdce8c4SStefano Zampini   /* create cusparse matrix */
25235f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(C,m,n,m,n));
25245f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(C,MATSEQAIJCUSPARSE));
2525fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2526fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2527fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2528fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2529fcdce8c4SStefano Zampini 
2530fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2531fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2532fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
25335f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex));
25345f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows));
2535fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2536fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2537fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2538fcdce8c4SStefano Zampini   } else {
2539fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2540fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2541fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2542fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2543fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2544fcdce8c4SStefano Zampini   }
2545fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2546fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2547fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2548fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2549fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2550fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
25515f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
25525f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
25535f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
25545f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
25555f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
25565f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
25575f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
25585f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
25595f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
2560fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2561fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2562fcdce8c4SStefano Zampini     c->nz = 0;
2563fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2564fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2565fcdce8c4SStefano Zampini     goto finalizesym;
2566fcdce8c4SStefano Zampini   }
2567fcdce8c4SStefano Zampini 
2568*28b400f6SJacob Faibussowitsch   PetscCheck(Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2569*28b400f6SJacob Faibussowitsch   PetscCheck(Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2570fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2571fcdce8c4SStefano Zampini   if (!biscompressed) {
2572fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2573fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2574fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2575fcdce8c4SStefano Zampini #endif
2576fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2577fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2578fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2579fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2580fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2581fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2582fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2583fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2584fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2585fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2586fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
25875f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
2588fcdce8c4SStefano Zampini     }
2589fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2590fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2591fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2592fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2593fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2594fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2595fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2596fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2597fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2598fcdce8c4SStefano Zampini     }
2599fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2600fcdce8c4SStefano Zampini #endif
2601fcdce8c4SStefano Zampini   }
2602*28b400f6SJacob Faibussowitsch   PetscCheck(Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2603*28b400f6SJacob Faibussowitsch   PetscCheck(Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2604fcdce8c4SStefano Zampini   /* precompute flops count */
2605fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2606fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2607fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2608fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2609fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2610fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2611fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2612fcdce8c4SStefano Zampini       }
2613fcdce8c4SStefano Zampini     }
2614fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2615fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2616fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2617fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2618fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2619fcdce8c4SStefano Zampini     }
2620fcdce8c4SStefano Zampini   } else { /* TODO */
2621fcdce8c4SStefano Zampini     flops = 0.;
2622fcdce8c4SStefano Zampini   }
2623fcdce8c4SStefano Zampini 
2624fcdce8c4SStefano Zampini   mmdata->flops = flops;
26255f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
2626b4285af6SJunchao Zhang 
2627fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
26285f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2629fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2630fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2631fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2632fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
26335f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc));
2634b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2635b4285af6SJunchao Zhang  {
2636b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2637b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2638b4285af6SJunchao Zhang   */
2639b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2640b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2641b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2642b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2643b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2644b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2645b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2646b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2647b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2648b4285af6SJunchao Zhang 
2649b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2650b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2651b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2652b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2653b4285af6SJunchao Zhang                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
26545f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**) &dBuffer1, bufferSize1));
2655b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2656b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2657b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2658b4285af6SJunchao Zhang                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2659b4285af6SJunchao Zhang 
2660b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2661b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2662b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2663b4285af6SJunchao Zhang                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
26645f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**) &dBuffer2, bufferSize2));
26655f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**) &dBuffer3, bufferSize3));
26665f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4));
2667b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2668b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2669b4285af6SJunchao Zhang                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
26705f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(dBuffer1));
26715f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(dBuffer2));
2672b4285af6SJunchao Zhang 
2673b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2674b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
26755f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2676b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2677b4285af6SJunchao Zhang   /* allocate matrix C */
2678b4285af6SJunchao Zhang   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2679b4285af6SJunchao Zhang   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2680b4285af6SJunchao Zhang   /* update matC with the new pointers */
2681b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2682b4285af6SJunchao Zhang                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2683b4285af6SJunchao Zhang 
2684b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2685b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2686b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2687b4285af6SJunchao Zhang                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
26885f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5));
2689b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2690b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2691b4285af6SJunchao Zhang                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
26925f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaFree(dBuffer3));
2693b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2694b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2695b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2696b4285af6SJunchao Zhang                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
26975f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024));
2698b4285af6SJunchao Zhang  }
2699ae37ee31SJunchao Zhang  #else
2700b4285af6SJunchao Zhang   size_t bufSize2;
2701fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2702b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2703fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2704fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2705fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
27065f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2));
2707fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2708b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2709fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2710fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2711fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2712fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2713b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2714fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2715fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2716fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2717fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2718fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2719fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2720fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2721fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
27225f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize));
2723fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2724b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2725fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2726fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2727fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2728fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
27295f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1));
2730fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
27315f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024));
2732fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2733fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2734fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2735fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2736fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2737fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2738b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2739fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2740fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2741ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2742fcdce8c4SStefano Zampini #else
27435f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST));
2744b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2745fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2746fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2747fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2748fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2749fcdce8c4SStefano Zampini   c->nz = cnz;
2750fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2751fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2752fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2753fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2754fcdce8c4SStefano Zampini 
27555f80ce2aSJacob Faibussowitsch   CHKERRCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
2756fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2757fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2758fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2759b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2760fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2761fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2762fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2763fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2764fcdce8c4SStefano Zampini #endif
27655f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuFlops(mmdata->flops));
27665f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
2767fcdce8c4SStefano Zampini finalizesym:
2768fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2769fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2770fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
27715f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(m+1,&c->i));
27725f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(c->nz,&c->j));
2773fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2774fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2775fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2776fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2777fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2778fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2779fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27805f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27815f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2782fcdce8c4SStefano Zampini   } else {
2783fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2784fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
27855f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
27865f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
2787fcdce8c4SStefano Zampini   }
2788fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2789fcdce8c4SStefano Zampini     PetscInt r = 0;
2790fcdce8c4SStefano Zampini     c->i[0] = 0;
2791fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2792fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2793fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2794fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2795fcdce8c4SStefano Zampini     }
2796fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2797fcdce8c4SStefano Zampini   }
27985f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
27995f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(m,&c->ilen));
28005f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(m,&c->imax));
2801fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2802fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2803fcdce8c4SStefano Zampini   c->rmax = 0;
2804fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2805fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2806fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2807fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2808fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2809fcdce8c4SStefano Zampini   }
28105f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMarkDiagonal_SeqAIJ(C));
28115f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(c->nz,&c->a));
2812fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2813fcdce8c4SStefano Zampini 
2814fcdce8c4SStefano Zampini   C->nonzerostate++;
28155f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(C->rmap));
28165f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(C->cmap));
2817fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2818fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2819fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2820fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2821fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2822abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2823fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2824fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2825fcdce8c4SStefano Zampini   }
2826fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2827fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2828fcdce8c4SStefano Zampini }
2829fcdce8c4SStefano Zampini 
/* Forward declaration only: no body for this routine appears at this point in the file.
   It takes a single Mat and returns a PetscErrorCode.
   NOTE(review): judging by the name, this presumably sets up products of a SeqAIJ matrix with a
   SeqDense matrix on the host side -- confirm against its definition before relying on that. */
2830fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2831fcdce8c4SStefano Zampini 
2832fcdce8c4SStefano Zampini /* handles sparse or dense B */
2833fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2834fcdce8c4SStefano Zampini {
2835fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2836fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2837fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2838fcdce8c4SStefano Zampini 
2839fcdce8c4SStefano Zampini   PetscFunctionBegin;
2840fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
28415f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense));
2842abb89eb1SStefano Zampini   if (!product->A->boundtocpu && !product->B->boundtocpu) {
28435f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp));
2844fcdce8c4SStefano Zampini   }
2845fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2846fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2847fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
28485f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp));
2849fcdce8c4SStefano Zampini     }
2850fcdce8c4SStefano Zampini   }
285165e4b4d4SStefano Zampini   if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
285265e4b4d4SStefano Zampini     PetscBool usecpu = PETSC_FALSE;
285365e4b4d4SStefano Zampini     switch (product->type) {
285465e4b4d4SStefano Zampini     case MATPRODUCT_AB:
285565e4b4d4SStefano Zampini       if (product->api_user) {
285665e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat");CHKERRQ(ierr);
28575f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
285865e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
285965e4b4d4SStefano Zampini       } else {
286065e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat");CHKERRQ(ierr);
28615f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL));
286265e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
286365e4b4d4SStefano Zampini       }
286465e4b4d4SStefano Zampini       break;
286565e4b4d4SStefano Zampini     case MATPRODUCT_AtB:
286665e4b4d4SStefano Zampini       if (product->api_user) {
286765e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat");CHKERRQ(ierr);
28685f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
286965e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
287065e4b4d4SStefano Zampini       } else {
287165e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat");CHKERRQ(ierr);
28725f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL));
287365e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
287465e4b4d4SStefano Zampini       }
287565e4b4d4SStefano Zampini       break;
287665e4b4d4SStefano Zampini     case MATPRODUCT_PtAP:
287765e4b4d4SStefano Zampini       if (product->api_user) {
287865e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat");CHKERRQ(ierr);
28795f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
288065e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
288165e4b4d4SStefano Zampini       } else {
288265e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat");CHKERRQ(ierr);
28835f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL));
288465e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
288565e4b4d4SStefano Zampini       }
288665e4b4d4SStefano Zampini       break;
288765e4b4d4SStefano Zampini     case MATPRODUCT_RARt:
288865e4b4d4SStefano Zampini       if (product->api_user) {
288965e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatRARt","Mat");CHKERRQ(ierr);
28905f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-matrart_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
289165e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
289265e4b4d4SStefano Zampini       } else {
289365e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_RARt","Mat");CHKERRQ(ierr);
28945f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatRARt",usecpu,&usecpu,NULL));
289565e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
289665e4b4d4SStefano Zampini       }
289765e4b4d4SStefano Zampini       break;
289865e4b4d4SStefano Zampini     case MATPRODUCT_ABC:
289965e4b4d4SStefano Zampini       if (product->api_user) {
290065e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMatMult","Mat");CHKERRQ(ierr);
29015f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-matmatmatmult_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
290265e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
290365e4b4d4SStefano Zampini       } else {
290465e4b4d4SStefano Zampini         ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_ABC","Mat");CHKERRQ(ierr);
29055f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMatMult",usecpu,&usecpu,NULL));
290665e4b4d4SStefano Zampini         ierr = PetscOptionsEnd();CHKERRQ(ierr);
290765e4b4d4SStefano Zampini       }
290865e4b4d4SStefano Zampini       break;
290965e4b4d4SStefano Zampini     default:
291065e4b4d4SStefano Zampini       break;
291165e4b4d4SStefano Zampini     }
291265e4b4d4SStefano Zampini     if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
291365e4b4d4SStefano Zampini   }
291465e4b4d4SStefano Zampini   /* dispatch */
2915fcdce8c4SStefano Zampini   if (isdense) {
2916ccdfe979SStefano Zampini     switch (product->type) {
2917ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2918ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2919ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2920ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2921ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2922fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
29235f80ce2aSJacob Faibussowitsch         CHKERRQ(MatProductSetFromOptions_SeqAIJ_SeqDense(mat));
2924fcdce8c4SStefano Zampini       } else {
2925fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2926fcdce8c4SStefano Zampini       }
2927fcdce8c4SStefano Zampini       break;
2928fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2929fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2930fcdce8c4SStefano Zampini       break;
2931ccdfe979SStefano Zampini     default:
2932ccdfe979SStefano Zampini       break;
2933ccdfe979SStefano Zampini     }
2934fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2935fcdce8c4SStefano Zampini     switch (product->type) {
2936fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2937fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2938fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2939fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2940fcdce8c4SStefano Zampini       break;
2941fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2942fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2943fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2944fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2945fcdce8c4SStefano Zampini       break;
2946fcdce8c4SStefano Zampini     default:
2947fcdce8c4SStefano Zampini       break;
2948fcdce8c4SStefano Zampini     }
2949fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
29505f80ce2aSJacob Faibussowitsch     CHKERRQ(MatProductSetFromOptions_SeqAIJ(mat));
2951fcdce8c4SStefano Zampini   }
2952ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2953ccdfe979SStefano Zampini }
2954ccdfe979SStefano Zampini 
29556fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
29569ae82921SPaul Mullowney {
29579ae82921SPaul Mullowney   PetscFunctionBegin;
29585f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE));
2959e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2960e6e9a74fSStefano Zampini }
2961e6e9a74fSStefano Zampini 
2962e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2963e6e9a74fSStefano Zampini {
2964e6e9a74fSStefano Zampini   PetscFunctionBegin;
29655f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE));
2966e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2967e6e9a74fSStefano Zampini }
2968e6e9a74fSStefano Zampini 
2969e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2970e6e9a74fSStefano Zampini {
2971e6e9a74fSStefano Zampini   PetscFunctionBegin;
29725f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE));
2973e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2974e6e9a74fSStefano Zampini }
2975e6e9a74fSStefano Zampini 
2976e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2977e6e9a74fSStefano Zampini {
2978e6e9a74fSStefano Zampini   PetscFunctionBegin;
29795f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE));
29809ae82921SPaul Mullowney   PetscFunctionReturn(0);
29819ae82921SPaul Mullowney }
29829ae82921SPaul Mullowney 
29836fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2984ca45077fSPaul Mullowney {
2985ca45077fSPaul Mullowney   PetscFunctionBegin;
29865f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE));
2987ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2988ca45077fSPaul Mullowney }
2989ca45077fSPaul Mullowney 
2990a0e72f99SJunchao Zhang __global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
2991a0e72f99SJunchao Zhang {
2992a0e72f99SJunchao Zhang   int i = blockIdx.x*blockDim.x + threadIdx.x;
2993a0e72f99SJunchao Zhang   if (i < n) y[idx[i]] += x[i];
2994a0e72f99SJunchao Zhang }
2995a0e72f99SJunchao Zhang 
2996afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2997e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
29989ae82921SPaul Mullowney {
29999ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
3000aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
30019ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
3002e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
3003e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
3004e6e9a74fSStefano Zampini   PetscBool                    compressed;
3005afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3006afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
3007afb2bd1cSJunchao Zhang #endif
30086e111a19SKarl Rupp 
30099ae82921SPaul Mullowney   PetscFunctionBegin;
30102c71b3e2SJacob Faibussowitsch   PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
3011cbc6b225SStefano Zampini   if (!a->nz) {
30125f80ce2aSJacob Faibussowitsch     if (!yy) CHKERRQ(VecSet_SeqCUDA(zz,0));
30135f80ce2aSJacob Faibussowitsch     else CHKERRQ(VecCopy_SeqCUDA(yy,zz));
3014e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
3015e6e9a74fSStefano Zampini   }
301634d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
30175f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
3018e6e9a74fSStefano Zampini   if (!trans) {
30199ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
30205f80ce2aSJacob Faibussowitsch     PetscCheck(matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
3021e6e9a74fSStefano Zampini   } else {
30221a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
3023e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
3024e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
3025e6e9a74fSStefano Zampini     } else {
30265f80ce2aSJacob Faibussowitsch       if (!cusparsestruct->matTranspose) CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
3027e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
3028e6e9a74fSStefano Zampini     }
3029e6e9a74fSStefano Zampini   }
3030e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
3031e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
3032213423ffSJunchao Zhang 
3033e6e9a74fSStefano Zampini   try {
30345f80ce2aSJacob Faibussowitsch     CHKERRQ(VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray));
30355f80ce2aSJacob Faibussowitsch     if (yy == zz) CHKERRQ(VecCUDAGetArray(zz,&zarray)); /* read & write zz, so need to get uptodate zarray on GPU */
30365f80ce2aSJacob Faibussowitsch     else CHKERRQ(VecCUDAGetArrayWrite(zz,&zarray)); /* write zz, so no need to init zarray on GPU */
3037afb2bd1cSJunchao Zhang 
30385f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuTimeBegin());
3039e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3040afb2bd1cSJunchao Zhang       /* z = A x + beta y.
3041afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
3042afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
3043afb2bd1cSJunchao Zhang       */
3044e6e9a74fSStefano Zampini       xptr = xarray;
3045afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
3046213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
3047afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3048afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
3049afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
3050afb2bd1cSJunchao Zhang        */
3051afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3052afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3053afb2bd1cSJunchao Zhang         nx = mat->num_cols;
3054afb2bd1cSJunchao Zhang         ny = mat->num_rows;
3055afb2bd1cSJunchao Zhang       }
3056afb2bd1cSJunchao Zhang      #endif
3057e6e9a74fSStefano Zampini     } else {
3058afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
3059afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
3060afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
3061afb2bd1cSJunchao Zhang        */
3062afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
3063e6e9a74fSStefano Zampini       dptr = zarray;
3064e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
3065afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
3066e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
3067a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
3068e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3069e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
3070e6e9a74fSStefano Zampini       }
3071afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3072afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3073afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
3074afb2bd1cSJunchao Zhang         nx = mat->num_rows;
3075afb2bd1cSJunchao Zhang         ny = mat->num_cols;
3076afb2bd1cSJunchao Zhang       }
3077afb2bd1cSJunchao Zhang      #endif
3078e6e9a74fSStefano Zampini     }
30799ae82921SPaul Mullowney 
3080afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
3081aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
3082afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
30835f80ce2aSJacob Faibussowitsch       PetscCheck(opA >= 0 && opA <= 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
3084afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
30855f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype));
30865f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype));
30875f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
3088afb2bd1cSJunchao Zhang                                                matstruct->matDescr,
3089afb2bd1cSJunchao Zhang                                                matstruct->cuSpMV[opA].vecXDescr, beta,
3090afb2bd1cSJunchao Zhang                                                matstruct->cuSpMV[opA].vecYDescr,
3091afb2bd1cSJunchao Zhang                                                cusparse_scalartype,
3092afb2bd1cSJunchao Zhang                                                cusparsestruct->spmvAlg,
30935f80ce2aSJacob Faibussowitsch                                                &matstruct->cuSpMV[opA].spmvBufferSize));
30945f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize));
3095afb2bd1cSJunchao Zhang 
3096afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
3097afb2bd1cSJunchao Zhang       } else {
3098afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
30995f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr));
31005f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr));
3101afb2bd1cSJunchao Zhang       }
3102afb2bd1cSJunchao Zhang 
31035f80ce2aSJacob Faibussowitsch       CHKERRCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA,
3104afb2bd1cSJunchao Zhang                                   matstruct->alpha_one,
31053606e59fSJunchao Zhang                                   matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
3106afb2bd1cSJunchao Zhang                                   matstruct->cuSpMV[opA].vecXDescr,
3107afb2bd1cSJunchao Zhang                                   beta,
3108afb2bd1cSJunchao Zhang                                   matstruct->cuSpMV[opA].vecYDescr,
3109afb2bd1cSJunchao Zhang                                   cusparse_scalartype,
3110afb2bd1cSJunchao Zhang                                   cusparsestruct->spmvAlg,
31115f80ce2aSJacob Faibussowitsch                                   matstruct->cuSpMV[opA].spmvBuffer));
3112afb2bd1cSJunchao Zhang      #else
31137656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
31145f80ce2aSJacob Faibussowitsch       CHKERRCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA,
3115a65300a6SPaul Mullowney                                        mat->num_rows, mat->num_cols,
3116afb2bd1cSJunchao Zhang                                        mat->num_entries, matstruct->alpha_one, matstruct->descr,
3117aa372e3fSPaul Mullowney                                        mat->values->data().get(), mat->row_offsets->data().get(),
3118e6e9a74fSStefano Zampini                                        mat->column_indices->data().get(), xptr, beta,
31195f80ce2aSJacob Faibussowitsch                                        dptr));
3120afb2bd1cSJunchao Zhang      #endif
3121aa372e3fSPaul Mullowney     } else {
3122213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
3123afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3124afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3125afb2bd1cSJunchao Zhang        #else
3126301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
31275f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA,
3128afb2bd1cSJunchao Zhang                                          matstruct->alpha_one, matstruct->descr, hybMat,
3129e6e9a74fSStefano Zampini                                          xptr, beta,
31305f80ce2aSJacob Faibussowitsch                                          dptr));
3131afb2bd1cSJunchao Zhang        #endif
3132a65300a6SPaul Mullowney       }
3133aa372e3fSPaul Mullowney     }
31345f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuTimeEnd());
3135aa372e3fSPaul Mullowney 
3136e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
3137213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
3138213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
31395f80ce2aSJacob Faibussowitsch           CHKERRQ(VecCopy_SeqCUDA(yy,zz)); /* zz = yy */
3140e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
31415f80ce2aSJacob Faibussowitsch           CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
31427656d835SStefano Zampini         }
3143213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
31445f80ce2aSJacob Faibussowitsch         CHKERRQ(VecSet_SeqCUDA(zz,0));
31457656d835SStefano Zampini       }
31467656d835SStefano Zampini 
3147213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
3148213423ffSJunchao Zhang       if (compressed) {
31495f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogGpuTimeBegin());
3150a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
3151a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
3152a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
3153a0e72f99SJunchao Zhang          */
3154a0e72f99SJunchao Zhang        #if 0
3155a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
3156a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
3157a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
3158e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
3159c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
3160a0e72f99SJunchao Zhang        #else
3161a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
3162a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
3163a0e72f99SJunchao Zhang        #endif
31645f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogGpuTimeEnd());
3165e6e9a74fSStefano Zampini       }
3166e6e9a74fSStefano Zampini     } else {
3167e6e9a74fSStefano Zampini       if (yy && yy != zz) {
31685f80ce2aSJacob Faibussowitsch         CHKERRQ(VecAXPY_SeqCUDA(zz,1.0,yy)); /* zz += yy */
3169e6e9a74fSStefano Zampini       }
3170e6e9a74fSStefano Zampini     }
31715f80ce2aSJacob Faibussowitsch     CHKERRQ(VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray));
31725f80ce2aSJacob Faibussowitsch     if (yy == zz) CHKERRQ(VecCUDARestoreArray(zz,&zarray));
31735f80ce2aSJacob Faibussowitsch     else CHKERRQ(VecCUDARestoreArrayWrite(zz,&zarray));
31749ae82921SPaul Mullowney   } catch(char *ex) {
317598921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
31769ae82921SPaul Mullowney   }
3177e6e9a74fSStefano Zampini   if (yy) {
31785f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuFlops(2.0*a->nz));
3179e6e9a74fSStefano Zampini   } else {
31805f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt));
3181e6e9a74fSStefano Zampini   }
31829ae82921SPaul Mullowney   PetscFunctionReturn(0);
31839ae82921SPaul Mullowney }
31849ae82921SPaul Mullowney 
31856fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3186ca45077fSPaul Mullowney {
3187ca45077fSPaul Mullowney   PetscFunctionBegin;
31885f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE));
3189ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3190ca45077fSPaul Mullowney }
3191ca45077fSPaul Mullowney 
31926fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
31939ae82921SPaul Mullowney {
3194042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
3195042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
31963fa6b06aSMark Adams 
3197042217e8SBarry Smith   PetscFunctionBegin;
31985f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd_SeqAIJ(A,mode));
3199042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
3200042217e8SBarry Smith 
32015f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
32025f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusp->deviceMat));
3203042217e8SBarry Smith     cusp->deviceMat = NULL;
3204042217e8SBarry Smith   }
32059ae82921SPaul Mullowney   PetscFunctionReturn(0);
32069ae82921SPaul Mullowney }
32079ae82921SPaul Mullowney 
32089ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3209e057df02SPaul Mullowney /*@
32109ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3211e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
3212e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3213e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3214e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3215e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
32169ae82921SPaul Mullowney 
3217d083f849SBarry Smith    Collective
32189ae82921SPaul Mullowney 
32199ae82921SPaul Mullowney    Input Parameters:
32209ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
32219ae82921SPaul Mullowney .  m - number of rows
32229ae82921SPaul Mullowney .  n - number of columns
32239ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
32249ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
32250298fd71SBarry Smith          (possibly different for each row) or NULL
32269ae82921SPaul Mullowney 
32279ae82921SPaul Mullowney    Output Parameter:
32289ae82921SPaul Mullowney .  A - the matrix
32299ae82921SPaul Mullowney 
32309ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
32319ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
32329ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
32339ae82921SPaul Mullowney 
32349ae82921SPaul Mullowney    Notes:
32359ae82921SPaul Mullowney    If nnz is given then nz is ignored
32369ae82921SPaul Mullowney 
32379ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
32389ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
32399ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
32409ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
32419ae82921SPaul Mullowney 
32429ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
32430298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
32449ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
32459ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
32469ae82921SPaul Mullowney 
32479ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
32489ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
32499ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
32509ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
32519ae82921SPaul Mullowney 
32529ae82921SPaul Mullowney    Level: intermediate
32539ae82921SPaul Mullowney 
3254e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
32559ae82921SPaul Mullowney @*/
32569ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
32579ae82921SPaul Mullowney {
32589ae82921SPaul Mullowney   PetscFunctionBegin;
32595f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(comm,A));
32605f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(*A,m,n,m,n));
32615f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(*A,MATSEQAIJCUSPARSE));
32625f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz));
32639ae82921SPaul Mullowney   PetscFunctionReturn(0);
32649ae82921SPaul Mullowney }
32659ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - destroy routine installed in ops->destroy for this matrix type.

   Releases the structure stored in A->spptr (its kind depends on A->factortype), removes the
   composed functions listed below from the object and finally calls MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* names of the composed functions that are reset to NULL, in the order they are removed */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  const PetscInt ncomposed = (PetscInt)(sizeof(composed)/sizeof(composed[0]));
  PetscInt       k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    /* factored matrices keep a Mat_SeqAIJCUSPARSETriFactors in spptr */
    CHKERRQ(MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr));
  } else {
    CHKERRQ(MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr));
  }
  for (k=0; k<ncomposed; k++) {
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,composed[k],NULL));
  }
  CHKERRQ(MatDestroy_SeqAIJ(A));
  PetscFunctionReturn(0);
}
32879ae82921SPaul Mullowney 
3288ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
328995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
   MatDuplicate_SeqAIJCUSPARSE - duplicates A through MatDuplicate_SeqAIJ() and then converts
   the duplicate in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  Mat dup;

  PetscFunctionBegin;
  CHKERRQ(MatDuplicate_SeqAIJ(A,cpvalues,B));
  dup = *B;
  CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(dup,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B));
  PetscFunctionReturn(0);
}
32979ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y.

   Depending on the (possibly upgraded) nonzero structure relation str, one of three paths is taken:
     SUBSET_NONZERO_PATTERN - sparse matrix addition through cusparse_csr_spgeam, writing into Y's arrays
     SAME_NONZERO_PATTERN   - axpy on the value arrays through cublasXaxpy
     anything else          - MatAXPY_SeqAIJ()
   When X and Y do not share the same ops->axpy, MatAXPY_SeqAIJ() is used directly.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ*)X->data,*yaij = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cuspY = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cuspX = (Mat_SeqAIJCUSPARSE*)X->spptr;
  PetscScalar        *yvals;
  const PetscScalar  *xvals;
  CsrMatrix          *csrY,*csrX;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(Y));
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(X));
  PetscCheck(cuspY->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  PetscCheck(cuspX->format == MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csrY = (CsrMatrix*)cuspY->mat->mat;
  csrX = (CsrMatrix*)cuspX->mat->mat;

  /* see if we can turn this into a cublas axpy: same nonzero count, no compressed rows, and
     identical row offsets and column indices (the column comparison only runs if the rows match) */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool samePattern = thrust::equal(thrust::device,csrY->row_offsets->begin(),csrY->row_offsets->end(),csrX->row_offsets->begin()) &&
                             thrust::equal(thrust::device,csrY->column_indices->begin(),csrY->column_indices->end(),csrX->column_indices->begin());
    if (samePattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  switch (str) {
  case SUBSET_NONZERO_PATTERN: {
    PetscScalar one = 1.0; /* scaling applied to Y in a*X + one*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t      bufferSize;
    void        *buffer;
#endif

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&xvals));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&yvals));
    /* the scalars a and one live on the host */
    CHKERRCUSPARSE(cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_HOST));
    {
      /* raw device pointers to the CSR index arrays of X and Y */
      auto xrow = csrX->row_offsets->data().get();
      auto xcol = csrX->column_indices->data().get();
      auto yrow = csrY->row_offsets->data().get();
      auto ycol = csrY->column_indices->data().get();

      /* the output matrix is described by Y's own descriptor and arrays */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      CHKERRCUSPARSE(cusparse_csr_spgeam_bufferSize(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                                    &a,cuspX->mat->descr,xaij->nz,xvals,xrow,xcol,
                                                    &one,cuspY->mat->descr,yaij->nz,yvals,yrow,ycol,
                                                    cuspY->mat->descr,      yvals,yrow,ycol,&bufferSize));
      CHKERRCUDA(cudaMalloc(&buffer,bufferSize));
      CHKERRQ(PetscLogGpuTimeBegin());
      CHKERRCUSPARSE(cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                         &a,cuspX->mat->descr,xaij->nz,xvals,xrow,xcol,
                                         &one,cuspY->mat->descr,yaij->nz,yvals,yrow,ycol,
                                         cuspY->mat->descr,      yvals,yrow,ycol,buffer));
      CHKERRQ(PetscLogGpuFlops(xaij->nz + yaij->nz));
      CHKERRQ(PetscLogGpuTimeEnd());
      CHKERRCUDA(cudaFree(buffer));
#else
      CHKERRQ(PetscLogGpuTimeBegin());
      CHKERRCUSPARSE(cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                         &a,cuspX->mat->descr,xaij->nz,xvals,xrow,xcol,
                                         &one,cuspY->mat->descr,yaij->nz,yvals,yrow,ycol,
                                         cuspY->mat->descr,      yvals,yrow,ycol));
      CHKERRQ(PetscLogGpuFlops(xaij->nz + yaij->nz));
      CHKERRQ(PetscLogGpuTimeEnd());
#endif
    }
    CHKERRCUSPARSE(cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_DEVICE));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&yvals));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  case SAME_NONZERO_PATTERN: {
    cublasHandle_t cublasv2handle;
    PetscBLASInt   inc = 1, nnz = 1;

    CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(X,&xvals));
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&yvals));
    CHKERRQ(PetscCUBLASGetHandle(&cublasv2handle));
    CHKERRQ(PetscBLASIntCast(xaij->nz,&nnz));
    CHKERRQ(PetscLogGpuTimeBegin());
    CHKERRCUBLAS(cublasXaxpy(cublasv2handle,nnz,&a,xvals,inc,yvals,inc));
    CHKERRQ(PetscLogGpuFlops(2.0*nnz));
    CHKERRQ(PetscLogGpuTimeEnd());
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals));
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&yvals));
    CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  } break;
  default:
    CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE));
    CHKERRQ(MatAXPY_SeqAIJ(Y,a,X,str));
    break;
  }
  PetscFunctionReturn(0);
}
339195639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - scales the stored values of Y by a using cublasXscal on the
   array obtained from MatSeqAIJCUSPARSEGetArray().
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  cublasHandle_t handle;
  PetscBLASInt   inc = 1, nnz = 1;

  PetscFunctionBegin;
  CHKERRQ(MatSeqAIJCUSPARSEGetArray(Y,&vals));
  CHKERRQ(PetscCUBLASGetHandle(&handle));
  /* the number of stored entries must fit in a BLAS integer */
  CHKERRQ(PetscBLASIntCast(aij->nz,&nnz));
  CHKERRQ(PetscLogGpuTimeBegin());
  CHKERRCUBLAS(cublasXscal(handle,nnz,&a,vals,inc));
  CHKERRQ(PetscLogGpuFlops(nnz));
  CHKERRQ(PetscLogGpuTimeEnd());
  CHKERRQ(MatSeqAIJCUSPARSERestoreArray(Y,&vals));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(Y));
  PetscFunctionReturn(0);
}
341133c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - zeroes the host value array and, for non-factored matrices,
   the value arrays of the mult structure and of its transpose when they exist.

   The offload mask becomes PETSC_OFFLOAD_BOTH only when the values of spptr->mat were zeroed
   here; otherwise it is set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ *aij           = (Mat_SeqAIJ*)A->data;
  PetscBool  gpuValsZeroed  = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE           *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    /* index 0: the matrix itself, index 1: its cached transpose */
    Mat_SeqAIJCUSPARSEMultStruct *structs[2] = {cusp->mat,cusp->matTranspose};

    for (int s=0; s<2; s++) {
      CsrMatrix *csr;

      if (!structs[s]) continue;
      csr = (CsrMatrix*)structs[s]->mat;
      if (!csr->values) continue;
      if (s == 0) gpuValsZeroed = PETSC_TRUE;
      thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
    }
  }
  /* a->i[number of rows] is the count of stored entries */
  CHKERRQ(PetscArrayzero(aij->a,aij->i[A->rmap->n]));
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  A->offloadmask = gpuValsZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
34403fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - switches the operation table of A between the SeqAIJ (flg true)
   and the SeqAIJCUSPARSE (flg false) implementations and records the choice in A->boundtocpu.

   For factored matrices only the flag is recorded. When binding to the CPU the matrix is first
   copied back with MatSeqAIJCUSPARSECopyFromGPU().
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (!flg) {
    /* GPU implementations */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE));
  } else {
    /* bring the data back to the host before switching to the CPU implementations */
    CHKERRQ(MatSeqAIJCUSPARSECopyFromGPU(A));

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear every entry of the SeqAIJ-level ops table (getarray, restorearray, ...) */
    CHKERRQ(PetscMemzero(aij->ops,sizeof(Mat_SeqAIJOps)));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ));
    CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL));
  }
  A->boundtocpu = flg;
  /* inodes are used only when bound to the CPU and inode sizes are available */
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3503a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE one.

   reuse selects where the result lives: MAT_INITIAL_MATRIX duplicates A into *newmat,
   MAT_REUSE_MATRIX copies A into the existing *newmat, any other value works on *newmat as given.
   The GPU-side structure in spptr is created only if it is missing and reuse is not
   MAT_REUSE_MATRIX. mtype is not used.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat B;

  PetscFunctionBegin;
  CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    CHKERRQ(MatDuplicate(A,MAT_COPY_VALUES,newmat));
    break;
  case MAT_REUSE_MATRIX:
    CHKERRQ(MatCopy(A,*newmat,SAME_NONZERO_PATTERN));
    break;
  default:
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to VECCUDA */
  CHKERRQ(PetscFree(B->defaultvectype));
  CHKERRQ(PetscStrallocpy(VECCUDA,&B->defaultvectype));

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      CHKERRQ(PetscNew(&factors));
      CHKERRCUSPARSE(cusparseCreate(&factors->handle));
      CHKERRCUSPARSE(cusparseSetStream(factors->handle,PetscDefaultCudaStream));
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      CHKERRQ(PetscNew(&cusp));
      CHKERRCUSPARSE(cusparseCreate(&cusp->handle));
      CHKERRCUSPARSE(cusparseSetStream(cusp->handle,PetscDefaultCudaStream));
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
     #if PETSC_PKG_CUDA_VERSION_GE(11,2,0)
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
     #else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
     #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU implementations of the remaining operations */
  CHKERRQ(MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE));
  CHKERRQ(PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE));
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
#endif
  CHKERRQ(PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE));
  PetscFunctionReturn(0);
}
35639ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - sets B up as a SeqAIJ matrix and then converts it in place
   to MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscFunctionBegin;
  /* step 1: plain SeqAIJ setup */
  CHKERRQ(MatCreate_SeqAIJ(B));
  /* step 2: in-place conversion, so the source and the result are the same object */
  CHKERRQ(MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B));
  PetscFunctionReturn(0);
}
357102fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
35897f756511SDominic Meiser 
3590bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
35910f39cd5aSBarry Smith 
/*
   MatSolverTypeRegister_CUSPARSE - registers the factorization routines provided here:
   the band LU solver for MATSEQAIJ and the LU, Cholesky, ILU and ICC factorizations
   for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factor types handled by MatGetFactor_seqaijcusparse_cusparse, in registration order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const PetscInt      nftypes  = (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscInt            k;

  PetscFunctionBegin;
  CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band));
  for (k=0; k<nftypes; k++) {
    CHKERRQ(MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(0);
}
360329b38603SBarry Smith 
/*
   MatResetPreallocationCOO_SeqAIJCUSPARSE - discards the COO preallocation data stored in mat->spptr.

   Deletes cooPerm and cooPerm_a and, when use_extended_coo is set, frees the device arrays
   jmap_d and perm_d. Every released pointer is reset to NULL: MatSeqAIJCUSPARSE_Destroy()
   calls cudaFree() on jmap_d and perm_d whenever they are non-NULL, so leaving stale values
   behind would make the destructor free the same device memory a second time.

   Does nothing when mat->spptr is NULL.
*/
static PetscErrorCode MatResetPreallocationCOO_SeqAIJCUSPARSE(Mat mat)
{
  Mat_SeqAIJCUSPARSE* cusp = (Mat_SeqAIJCUSPARSE*)mat->spptr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  cusp->cooPerm = NULL;
  cusp->cooPerm_a = NULL;
  if (cusp->use_extended_coo) {
    /* reset each pointer right after its release so no dangling value survives an error return */
    CHKERRCUDA(cudaFree(cusp->jmap_d));
    cusp->jmap_d = NULL;
    CHKERRCUDA(cudaFree(cusp->perm_d));
    cusp->perm_d = NULL;
  }
  cusp->use_extended_coo = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3622cbc6b225SStefano Zampini 
/*
   MatSeqAIJCUSPARSE_Destroy - releases everything owned by a Mat_SeqAIJCUSPARSE and the
   structure itself. Does nothing when *cusparsestruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);
  /* the matrix and its transpose share the same storage format */
  CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format));
  CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) CHKERRCUSPARSE(cusparseDestroy(cusp->handle));
  if (cusp->jmap_d) CHKERRCUDA(cudaFree(cusp->jmap_d));
  if (cusp->perm_d) CHKERRCUDA(cudaFree(cusp->perm_d));
  /* free through the caller's pointer */
  CHKERRQ(PetscFree(*cusparsestruct));
  PetscFunctionReturn(0);
}
36417f756511SDominic Meiser 
/*
   CsrMatrix_Destroy - deletes the three arrays of a CsrMatrix and the matrix itself,
   then resets *mat. Does nothing when *mat is already null.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
36547f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular factor overload) - releases the cuSPARSE
   objects, the CSR matrix and the buffers held by a Mat_SeqAIJCUSPARSETriFactorStruct, then
   frees the structure. Does nothing when *trifactor is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);
  if (tf->descr)         CHKERRCUSPARSE(cusparseDestroyMatDescr(tf->descr));
  if (tf->solveInfo)     CHKERRCUSPARSE(cusparse_destroy_analysis_info(tf->solveInfo));
  CHKERRQ(CsrMatrix_Destroy(&tf->csrMat));
  if (tf->solveBuffer)   CHKERRCUDA(cudaFree(tf->solveBuffer));
  /* AA_h is released with cudaFreeHost, unlike the other buffers */
  if (tf->AA_h)          CHKERRCUDA(cudaFreeHost(tf->AA_h));
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) CHKERRCUDA(cudaFree(tf->csr2cscBuffer));
 #endif
  /* free through the caller's pointer */
  CHKERRQ(PetscFree(*trifactor));
  PetscFunctionReturn(0);
}
36717f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (mult structure overload) - releases a
   Mat_SeqAIJCUSPARSEMultStruct and sets *matstruct to NULL.

   Input Parameters:
+  matstruct - address of the structure pointer; nothing is done when *matstruct is NULL
-  format    - storage format of (*matstruct)->mat; it selects how that member is released

   Notes:
   For MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB the stored matrix is a cusparseHybMat_t; with
   CUDA 11.0 or later these formats raise PETSC_ERR_SUP. For every other format the stored
   matrix is a CsrMatrix released with CsrMatrix_Destroy(), whose error code is propagated.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        CHKERRCUSPARSE(cusparseDestroyHybMat(hybMat));
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code, as the triangular factor overload does for the same call */
        CHKERRQ(CsrMatrix_Destroy(&mat));
      }
    }
    if ((*matstruct)->descr) CHKERRCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr));
    delete (*matstruct)->cprowIndices;
    /* the three scalar constants are released with cudaFree */
    if ((*matstruct)->alpha_one) CHKERRCUDA(cudaFree((*matstruct)->alpha_one));
    if ((*matstruct)->beta_zero) CHKERRCUDA(cudaFree((*matstruct)->beta_zero));
    if ((*matstruct)->beta_one)  CHKERRCUDA(cudaFree((*matstruct)->beta_one));

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) CHKERRCUSPARSE(cusparseDestroySpMat(mdata->matDescr));
    /* release the buffer and vector descriptors of each initialized cuSpMV slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        CHKERRCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr));
        CHKERRCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr));
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37137f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the data held by a triangular-factors struct while
  keeping the struct itself allocated.

  Input Parameter:
. trifactors - address of the pointer to the struct; nothing is done when *trifactors is NULL

  Notes:
  Every released member is set back to NULL so that the routine can be called repeatedly on the same
  struct (MatSeqAIJCUSPARSETriFactors_Destroy() calls it again) without freeing memory twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors_p fs = *trifactors;

  PetscFunctionBegin;
  if (fs) {
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose));
    CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose));
    delete fs->rpermIndices;
    delete fs->cpermIndices;
    delete fs->workVector;
    fs->rpermIndices = NULL;
    fs->cpermIndices = NULL;
    fs->workVector   = NULL;
    if (fs->a_band_d) {
      CHKERRCUDA(cudaFree(fs->a_band_d));
      fs->a_band_d = NULL; /* avoid a second cudaFree() on the next reset */
    }
    if (fs->i_band_d) {
      CHKERRCUDA(cudaFree(fs->i_band_d));
      fs->i_band_d = NULL; /* avoid a second cudaFree() on the next reset */
    }
    fs->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3734ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a triangular-factors struct, destroys its cuSPARSE
  handle (when one is set) and frees the struct with PetscFree().

  Input Parameter:
. trifactors - address of the pointer to the struct; nothing is done when *trifactors is NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);
  CHKERRQ(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
  handle = (*trifactors)->handle;
  if (handle) {
    CHKERRCUSPARSE(cusparseDestroy(handle));
  }
  CHKERRQ(PetscFree(*trifactors));
  PetscFunctionReturn(0);
}
37497e8381f9SStefano Zampini 
/* Strict ordering of (i,j) pairs: first by i, ties broken by j */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 != i2) return i1 < i2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
37607e8381f9SStefano Zampini 
/* Two (i,j) pairs are equal when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
37707e8381f9SStefano Zampini 
/* Change indicator: 1 when the two indices differ, 0 when they are the same */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
37797e8381f9SStefano Zampini 
/* Logical OR of two indicators: 1 when at least one of them is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    const bool any = (t1 != 0) || (t2 != 0);

    return any;
  }
};
37887e8381f9SStefano Zampini 
37897e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - inserts or adds COO values into the device CSR values array,
  using the permutation cusp->cooPerm and, when present, the repeated-entry map cusp->cooPerm_a.

  Input Parameters:
+ A     - the matrix
. v     - the COO values, in host or device memory (decided with isCudaMem()); may be NULL
- imode - ADD_VALUES adds to the stored values, any other mode overwrites them

  Notes:
  Without a cooPerm the routine only runs MatAssemblyBegin()/MatAssemblyEnd().
  A NULL v with INSERT_VALUES zeroes the stored values; a NULL v with any other mode leaves them untouched.
  On exit the matrix is flagged as assembled and its offload mask is set to PETSC_OFFLOAD_GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp    = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij     = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY        *staging = NULL; /* device copy of v[], only allocated when v[] is not CUDA memory */
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscCheck(cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    CHKERRQ(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY));
    CHKERRQ(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY));
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (!v) {
    /* no values given: only INSERT_VALUES has an effect, it clears the stored values */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  } else {
    thrust::device_ptr<const PetscScalar> d_v;
    const PetscInt                        n = cusp->cooPerm->size();

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      staging = new THRUSTARRAY(n);
      staging->assign(v,v+n);
      d_v = staging->data();
      CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
    }

    CHKERRQ(PetscLogGpuTimeBegin());
    if (cusp->cooPerm_a) {
      /* There are repeated entries in d_v[] that have to be summed first.
         thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. Its length is n, the number of nonzeros in d_v[].
         cooPerm_a is ordered; the i-th permuted value belongs to the cooPerm_a[i]-th unique nonzero. */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());

      if (imode == ADD_VALUES) {
        /* reduce into a temporary, then add the temporary to the existing values */
        THRUSTARRAY reduced(csr->values->size());

        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),reduced.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(reduced.begin(),reduced.end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
      } else {
        /* reduce straight into the values array, overwriting what was there */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else {
      /* all nonzeros in d_v[] are unique entries: pair values[i] with d_v[cooPerm[i]] */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                csr->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                csr->values->end()));

      if (imode == ADD_VALUES) thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
      else thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
    CHKERRQ(PetscLogGpuTimeEnd());
  }

  delete staging;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  CHKERRQ(PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,aij->nz));
  CHKERRQ(PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n"));
  CHKERRQ(PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",aij->rmax));
  aij->reallocs       = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->was_assembled    = PETSC_TRUE;
  A->assembled        = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
38717e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached transpose of A as out of date.

  Input Parameters:
+ A       - the matrix, must be of type MATSEQAIJCUSPARSE
- destroy - when PETSC_TRUE the cached transpose struct and the csr2csc_i array are also freed

  Notes:
  Nothing is changed when A has no CUSPARSE data attached (A->spptr is NULL).
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      CHKERRQ(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format));
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3887a49f1ed0SStefano Zampini 
38887e8381f9SStefano Zampini #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - builds the CSR pattern of A from n COO index pairs and
  records the device arrays later used to route COO values into the CSR values array.

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices of the entries
- coo_j - column indices of the entries

  Notes:
  cusp->cooPerm holds the permutation that sorts the entries by (row, column). cusp->cooPerm_a is kept
  only when some (row, column) pair is repeated; it then maps every sorted entry to the index of its
  unique nonzero. The host CSR arrays of A are rebuilt, zeroed values are copied to the GPU with
  MatSeqAIJCUSPARSECopyToGPU() and the cached transpose is destroyed.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  PetscInt           nold, nonemptyrows = 0;

  PetscFunctionBegin;
  CHKERRQ(PetscLayoutSetUp(A->rmap));
  CHKERRQ(PetscLayoutSetUp(A->cmap));

  /* drop the cached COO arrays when the number of entries changed */
  nold = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != nold) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }

  if (!n) {
    CHKERRQ(MatSeqAIJSetPreallocation(A,0,NULL));
  } else {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY rowend(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Running example:
         n     = 6
         coo_i = [3,3,1,4,1,4]
         coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    CHKERRQ(PetscLogGpuTimeBegin());
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by column */
    *cusp->cooPerm_a = d_i; /* copy of the sorted row indices */
    THRUSTINTARRAY w(d_j);  /* copy of the sorted column indices */

    /* After sorting:
         d_i     = [1,1,3,3,4,4]
         d_j     = [2,2,2,3,5,6]
         cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) pairs */

    /* After removing duplicates (x marks the leftover slot):
         d_i = [1,3,3,4,4,x]
         d_j = [2,2,3,5,6,x]
       nekey points at the x slot, ekey one past it.
    */
    if (nekey == ekey) { /* all entries are unique: no repeated-entry map is needed */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j along the sorted (i,j) sequence starts a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry; note that element access on a device array implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());             /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    }

    /* Binary search of the row numbers [0,1,2,3,4,5] in the ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6.
       For each row number the result is the last position in d_i[] where it could be inserted without breaking the ordering,
       giving rowend = [0,1,1,3,5,5]. The leading 0 of the row offsets is added below. */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        rowend.begin());
    CHKERRQ(PetscLogGpuTimeEnd());

    /* rebuild the host CSR arrays from the device results */
    CHKERRQ(MatSeqXAIJFreeAIJ(A,&aij->a,&aij->j,&aij->i));
    aij->singlemalloc = PETSC_FALSE;
    aij->free_a       = PETSC_TRUE;
    aij->free_ij      = PETSC_TRUE;
    CHKERRQ(PetscMalloc1(A->rmap->n+1,&aij->i));
    aij->i[0] = 0; /* in the example aij->i = [0,0,1,1,3,5,5] */
    CHKERRCUDA(cudaMemcpy(aij->i+1,rowend.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    aij->nz = aij->maxnz = aij->i[A->rmap->n];
    aij->rmax = 0;
    CHKERRQ(PetscMalloc1(aij->nz,&aij->a));
    CHKERRQ(PetscMalloc1(aij->nz,&aij->j));
    CHKERRCUDA(cudaMemcpy(aij->j,d_j.data().get(),aij->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost));
    if (!aij->ilen) CHKERRQ(PetscMalloc1(A->rmap->n,&aij->ilen));
    if (!aij->imax) CHKERRQ(PetscMalloc1(A->rmap->n,&aij->imax));
    /* per-row lengths, number of nonempty rows and longest row */
    for (PetscInt row = 0; row < A->rmap->n; row++) {
      const PetscInt len = aij->i[row+1] - aij->i[row];

      if (len) nonemptyrows++;
      aij->imax[row] = len;
      aij->ilen[row] = len;
      if (len > aij->rmax) aij->rmax = len;
    }
    aij->nonzerorowcnt = nonemptyrows;
    A->preallocated    = PETSC_TRUE;
    CHKERRQ(PetscLogGpuToCpu((A->rmap->n+aij->nz)*sizeof(PetscInt)));
    CHKERRQ(MatMarkDiagonal_SeqAIJ(A));
  }
  CHKERRQ(MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE));

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  CHKERRQ(PetscArrayzero(aij->a,aij->nz));
  CHKERRQ(MatCheckCompressedRow(A,nonemptyrows,&aij->compressedrow,aij->i,A->rmap->n,0.6));
  A->offloadmask = PETSC_OFFLOAD_CPU;
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE));
  PetscFunctionReturn(0);
}
4001ed502f03SStefano Zampini 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation entry point that chooses between the
  'Basic' device path and the host path of MatSetPreallocationCOO_SeqAIJ().

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices of the entries
- coo_j - column indices of the entries

  Notes:
  The host path is taken only when coo_i[] is host memory and a negative row or column index is found.
  In that case the jmap and perm arrays built on the host are copied to the device and
  use_extended_coo is set.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscBool    has_negative = PETSC_FALSE;
  PetscMemType mtype        = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  CHKERRQ(MatResetPreallocationCOO_SeqAIJ(mat));
  CHKERRQ(MatResetPreallocationCOO_SeqAIJCUSPARSE(mat));
  if (coo_i) {
    CHKERRQ(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      /* the indices can only be inspected here when they live on the host */
      for (PetscCount k=0; k<coo_n && !has_negative; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) has_negative = PETSC_TRUE;
      }
    }
  }

  if (has_negative) {
    Mat_SeqAIJ         *seq;
    Mat_SeqAIJCUSPARSE *dev;

    CHKERRQ(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mat));
    seq = static_cast<Mat_SeqAIJ*>(mat->data);
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    /* mirror the host jmap (nz+1 entries) and perm (Atot entries) arrays on the device */
    CHKERRCUDA(cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    CHKERRCUDA(cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount)));
    CHKERRCUDA(cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice));
    dev->use_extended_coo = PETSC_TRUE;
  } else { /* i,j are on device or do not contain negative indices */
    CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  }
  PetscFunctionReturn(0);
}
4037219fbbafSJunchao Zhang 
/*
  MatAddCOOValues - kernel that accumulates COO values into the nonzeros of a matrix.

  For every nonzero i in [0,nnz) the values kv[perm[k]], k in [jmap[i],jmap[i+1]), are summed; the sum
  replaces a[i] when imode is INSERT_VALUES and is added to a[i] otherwise.

  Input Parameters:
+ kv    - the COO values
. nnz   - number of nonzeros, i.e. entries of a[] to process
. jmap  - nnz+1 offsets into perm[]; entries jmap[i]..jmap[i+1]-1 belong to nonzero i
. perm  - indices into kv[]
- imode - INSERT_VALUES overwrites a[], any other mode accumulates into it

  Output Parameter:
. a - the nonzero values

  Notes:
  Written as a grid-stride loop, so any 1D launch configuration covers all nnz entries.
  The index and the stride are promoted to PetscCount before multiplying: the built-in index
  variables are unsigned int, and their product would otherwise be computed in 32 bits.
*/
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x*blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4048219fbbafSJunchao Zhang 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of A from COO values.

  Input Parameters:
+ A     - the matrix
. v     - the COO values, in host or device memory
- imode - INSERT_VALUES or ADD_VALUES

  Notes:
  When use_extended_coo is not set the work is delegated to MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
  Otherwise host values are first copied into a temporary device buffer and the MatAddCOOValues
  kernel combines them into the device values array of A.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ         *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscCount   Annz = seq->nz;
  PetscMemType       memtype;
  const PetscScalar  *d_vals  = v;    /* values as seen by the kernel */
  PetscScalar        *staging = NULL; /* temporary device copy of v[] when v[] is host memory */
  PetscScalar        *Aa;

  PetscFunctionBegin;
  if (!dev->use_extended_coo) {
    CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode));
    PetscFunctionReturn(0);
  }

  CHKERRQ(PetscGetMemType(v,&memtype));
  if (PetscMemTypeHost(memtype)) { /* the user gave v[] on the host: stage it on the device */
    const size_t nbytes = seq->coo_n*sizeof(PetscScalar);

    CHKERRCUDA(cudaMalloc((void**)&staging,nbytes));
    CHKERRCUDA(cudaMemcpy((void*)staging,v,nbytes,cudaMemcpyHostToDevice));
    d_vals = staging;
  }

  /* write-only access is enough when the old values are overwritten */
  if (imode == INSERT_VALUES) {
    CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa));
  } else {
    CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa));
  }

  if (Annz) {
    MatAddCOOValues<<<(Annz+255)/256,256>>>(d_vals,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
    CHKERRCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
  } else {
    CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
  }

  if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)staging));
  PetscFunctionReturn(0);
}
4083219fbbafSJunchao Zhang 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers, pass NULL if not needed
-   j - the CSR column indices, pass NULL if not needed

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      The routine returns immediately, without touching the matrix, when both i and j are NULL

      May trigger a host-device copy, since MatSeqAIJCUSPARSECopyToGPU() is called

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing requested; a single NULL output must not suppress the other one */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build the uncompressed row offsets on the device once and cache them */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
41315f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE, the value passed to MatSeqAIJCUSPARSEGetIJ(); not used by this routine

    Output Parameters:
+   i - the CSR row pointers; set to NULL on return when a non-NULL address is passed
-   j - the CSR column indices; set to NULL on return when a non-NULL address is passed

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate the caller's pointers; the device arrays themselves are left untouched */
  if (j != NULL) {
    *j = NULL;
  }
  if (i != NULL) {
    *i = NULL;
  }
  PetscFunctionReturn(0);
}
41585f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host.
   Only the CSR storage format is supported; an error is raised for the ELL and HYB formats.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read A->spptr only after A has been validated, so that an invalid A raises an error instead of crashing */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy exists and is current before handing out a pointer to it */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4193ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; reset to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: the matrix itself is left untouched, only the caller's handle is cleared */
  *a = NULL;
  PetscFunctionReturn(0);
}
4218ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host.
   On return the matrix is flagged as having its valid data on the GPU and MatSeqAIJCUSPARSEInvalidateTranspose() has been called on it.
   Only the CSR storage format is supported; an error is raised for the ELL and HYB formats.

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read A->spptr only after A has been validated, so that an invalid A raises an error instead of crashing */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* the caller may read the current values, so the device copy must be brought up to date first */
  CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may modify the values through the returned pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; reset to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() on the matrix and increases its object state

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been changed through the pointer handed out by MatSeqAIJCUSPARSEGetArray() */
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  /* clear the caller's handle last, only after the bookkeeping above succeeded */
  *a = NULL;
  PetscFunctionReturn(0);
}
4281039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU.
   MatSeqAIJCUSPARSEInvalidateTranspose() is called on the matrix before returning.
   Only the CSR storage format is supported; an error is raised for the ELL and HYB formats, or if the device data structure is missing.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read A->spptr only after A has been validated, so that an invalid A raises an error instead of crashing */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* no MatSeqAIJCUSPARSECopyToGPU() here: the device structure must already be present */
  PetscCheck(cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheck(csr,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  PetscCheck(csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller is going to write the values through the returned pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE));
  PetscFunctionReturn(0);
}
4317ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; reset to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() on the matrix and increases its object state

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values were (re)written through the pointer handed out by MatSeqAIJCUSPARSEGetArrayWrite() */
  CHKERRQ(MatSeqAIJInvalidateDiagonal(A));
  CHKERRQ(PetscObjectStateIncrease((PetscObject)A));
  /* clear the caller's handle last, only after the bookkeeping above succeeded */
  *a = NULL;
  PetscFunctionReturn(0);
}
4344ed502f03SStefano Zampini 
/*
  Comparison functor for 4-tuples whose first two entries are a row index and a column index.
  Tuples are ordered lexicographically by (row, column); the third and fourth entries are ignored.
*/
struct IJCompare4
{
  /* thrust::get<N>() is used instead of the member t.get<N>(), which is not available in every Thrust tuple implementation */
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    const int row1 = thrust::get<0>(t1), row2 = thrust::get<0>(t2);

    if (row1 != row2) return row1 < row2;
    /* same row: the column index decides */
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4355ed502f03SStefano Zampini 
/* Unary functor adding a fixed integer offset (given at construction) to its argument */
struct Shift
{
  int _shift; /* offset added by operator() */

  Shift(int shift) : _shift(shift) {}
  /* const-qualified: the call does not modify the functor, so it can also be invoked through a const object */
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4367ed502f03SStefano Zampini 
/* Merges two SeqAIJCUSPARSE matrices A and B with the same number of rows into C by appending each row of B to the corresponding row of A, i.e. C = [A, B] ([A';B']' in MATLAB notation) */
4369ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4370ed502f03SStefano Zampini {
4371ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4372ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4373ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4374ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4375ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4376ed502f03SStefano Zampini   cusparseStatus_t             stat;
4377ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4378ed502f03SStefano Zampini 
4379ed502f03SStefano Zampini   PetscFunctionBegin;
4380ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4381ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4382ed502f03SStefano Zampini   PetscValidPointer(C,4);
4383ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4384ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
43855f80ce2aSJacob Faibussowitsch   PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
43862c71b3e2SJacob Faibussowitsch   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
43872c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
43882c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4389ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4390ed502f03SStefano Zampini     m     = A->rmap->n;
4391ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
43925f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreate(PETSC_COMM_SELF,C));
43935f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetSizes(*C,m,n,m,n));
43945f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetType(*C,MATSEQAIJCUSPARSE));
4395ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4396ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4397ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4398ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4399ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4400ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4401ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4402ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4403ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4404ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4405ed502f03SStefano Zampini     Ccusp->nrows    = m;
4406ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4407ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4408ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4409ed502f03SStefano Zampini     Ccsr->num_cols  = n;
44105f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseCreateMatDescr(&Cmat->descr));
44115f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO));
44125f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
44135f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar)));
44145f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar)));
44155f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar)));
44165f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44175f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
44185f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
44195f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
44205f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
4421*28b400f6SJacob Faibussowitsch     PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4422*28b400f6SJacob Faibussowitsch     PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4423ed502f03SStefano Zampini 
4424ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4425ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4426ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4427ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4428ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4429ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4430ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4431ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4432ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4433ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4434ed502f03SStefano Zampini     if (c->nz) {
44352ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44362ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44372ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44382ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44392ed87e7eSStefano Zampini 
4440ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4441ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4442ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4443ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
44445f80ce2aSJacob Faibussowitsch           CHKERRQ(PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt)));
4445ed502f03SStefano Zampini         }
44462ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44472ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4448ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4449ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4450ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4451ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
44525f80ce2aSJacob Faibussowitsch           CHKERRQ(PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt)));
4453ed502f03SStefano Zampini         }
44542ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44552ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
44565f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogGpuTimeBegin());
44572ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44582ed87e7eSStefano Zampini                               Aroff->data().get(),
44592ed87e7eSStefano Zampini                               Annz,
44602ed87e7eSStefano Zampini                               m,
44612ed87e7eSStefano Zampini                               Acoo->data().get(),
44622ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4463ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44642ed87e7eSStefano Zampini                               Broff->data().get(),
4465ed502f03SStefano Zampini                               Bnnz,
4466ed502f03SStefano Zampini                               m,
44672ed87e7eSStefano Zampini                               Bcoo->data().get(),
4468ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
44692ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44702ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44712ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44728909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4473ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4474ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44758909a122SStefano Zampini #else
44768909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44778909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44788909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44798909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44808909a122SStefano Zampini #endif
44812ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44822ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44832ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44842ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
44852ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
44862ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4487ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4488ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4489ed502f03SStefano Zampini       thrust::advance(p2,Annz);
44902ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
44918909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
44928909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
44938909a122SStefano Zampini #endif
44942ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
44952ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
44962ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
44972ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
44982ed87e7eSStefano Zampini #else
44992ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
45002ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
45012ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
45022ed87e7eSStefano Zampini #endif
4503ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
45042ed87e7eSStefano Zampini                               Ccoo->data().get(),
4505ed502f03SStefano Zampini                               c->nz,
4506ed502f03SStefano Zampini                               m,
4507ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4508ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
45095f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogGpuTimeEnd());
45102ed87e7eSStefano Zampini       delete wPerm;
45112ed87e7eSStefano Zampini       delete Acoo;
45122ed87e7eSStefano Zampini       delete Bcoo;
45132ed87e7eSStefano Zampini       delete Ccoo;
4514ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4515ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4516ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4517ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4518ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4519ed502f03SStefano Zampini #endif
45201a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45215f80ce2aSJacob Faibussowitsch         CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(A));
45225f80ce2aSJacob Faibussowitsch         CHKERRQ(MatSeqAIJCUSPARSEFormExplicitTranspose(B));
4523ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4524ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4525ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4526ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4527ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4528ed502f03SStefano Zampini 
45291a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45301a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4531a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4532ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4533ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4534ed502f03SStefano Zampini         CcsrT->num_rows = n;
4535ed502f03SStefano Zampini         CcsrT->num_cols = m;
4536ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4537ed502f03SStefano Zampini 
4538ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4539ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4540ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4541ed502f03SStefano Zampini 
45425f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogGpuTimeBegin());
4543ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4544ed502f03SStefano Zampini         if (AT) {
4545ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4546ed502f03SStefano Zampini           thrust::advance(rT,-1);
4547ed502f03SStefano Zampini         }
4548ed502f03SStefano Zampini         if (BT) {
4549ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4550ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4551ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4552ed502f03SStefano Zampini         }
4553ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4554ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4555ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4556ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4557ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4558ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
45595f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscLogGpuTimeEnd());
4560ed502f03SStefano Zampini 
45615f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseCreateMatDescr(&CmatT->descr));
45625f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO));
45635f80ce2aSJacob Faibussowitsch         CHKERRCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));
45645f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar)));
45655f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar)));
45665f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar)));
45675f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
45685f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice));
45695f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice));
4570ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4571ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4572ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4573ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4574ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4575ed502f03SStefano Zampini #endif
4576ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4577ed502f03SStefano Zampini       }
4578ed502f03SStefano Zampini     }
4579ed502f03SStefano Zampini 
4580ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4581ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4582ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
45835f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(m+1,&c->i));
45845f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(c->nz,&c->j));
4585ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4586ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4587ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4588ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4589ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
45905f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45915f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4592ed502f03SStefano Zampini     } else {
45935f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
45945f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost));
4595ed502f03SStefano Zampini     }
45965f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt)));
45975f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(m,&c->ilen));
45985f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(m,&c->imax));
4599ed502f03SStefano Zampini     c->maxnz = c->nz;
4600ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4601ed502f03SStefano Zampini     c->rmax = 0;
4602ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4603ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4604ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4605ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4606ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4607ed502f03SStefano Zampini     }
46085f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMarkDiagonal_SeqAIJ(*C));
46095f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(c->nz,&c->a));
4610ed502f03SStefano Zampini     (*C)->nonzerostate++;
46115f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutSetUp((*C)->rmap));
46125f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutSetUp((*C)->cmap));
4613ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4614ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4615ed502f03SStefano Zampini   } else {
46162c71b3e2SJacob Faibussowitsch     PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4617ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4618ed502f03SStefano Zampini     if (c->nz) {
4619ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
46205f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
46212c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
46222c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
46235f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
46245f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(B));
46255f80ce2aSJacob Faibussowitsch       PetscCheck(Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
46265f80ce2aSJacob Faibussowitsch       PetscCheck(Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4627ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4628ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4629ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
46302c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
46312c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
46322c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
46332c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
46345f80ce2aSJacob Faibussowitsch       PetscCheck(Ccusp->cooPerm->size() == Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4635ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4636ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
46375f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogGpuTimeBegin());
4638ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4639ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4640ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4641ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4642ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4643ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4644ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4645ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4646ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4647ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
46485f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE));
46491a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
46505f80ce2aSJacob Faibussowitsch         PetscCheck(Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4651ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4652ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4653ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4654ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4655ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4656ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4657ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46581a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4659ed502f03SStefano Zampini       }
46605f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogGpuTimeEnd());
4661ed502f03SStefano Zampini     }
4662ed502f03SStefano Zampini   }
46635f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectStateIncrease((PetscObject)*C));
4664ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4665ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4666ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4667ed502f03SStefano Zampini   PetscFunctionReturn(0);
4668ed502f03SStefano Zampini }
4669c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the matrix value array obtained from
  MatSeqAIJCUSPARSEGetArrayRead() into a user supplied buffer.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of values to copy
- idx - positions inside the value array of the entries to gather; when NULL (or when n is 0)
        the first n values are copied contiguously

  Output Parameter:
. v - destination for the n values; isCudaMem() is used to decide whether it is device or host memory

  Notes:
  With idx given, the gather v[i] = av[idx[i]] is performed by thrust::for_each() on device iterators.
  If v is host memory the result is gathered into a temporary device buffer and then copied back.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool              dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  CHKERRQ(MatSeqAIJCUSPARSEGetArrayRead(A,&av));
  if (n && idx) {
    /* upload the index set so that the gather can be driven by device iterators */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscInt)));

    /* Staging buffer, only sized when v is host memory. It is a scoped object (not new/delete)
       so that it is released even if one of the error-checking macros below returns early */
    THRUSTARRAY w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[i]] with dv[i]; VecCUDAEquals is applied to each pair to perform the copy */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      CHKERRCUDA(cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    }
  } else {
    /* no index set: plain contiguous copy of the leading n values */
    CHKERRCUDA(cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) CHKERRQ(PetscLogGpuToCpu(n*sizeof(PetscScalar)));
  CHKERRQ(MatSeqAIJCUSPARSERestoreArrayRead(A,&av));
  PetscFunctionReturn(0);
}
4707