xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision c0aa6a63a7860d309a895cb4aa0f9e11e7859f3a)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Name tables handed to PetscOptionsEnum(). Each table lists the value names first, then the
   enum type name, then the common prefix of the enum constants, and ends with a null entry. */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The enums below are copied from cusparse.h in CUDA-11.0. The value names in
     MatCUSPARSESpMVAlgorithms[] etc. are listed in 0-based integer value order, because
     PetscOptionsEnum() is used to parse the user's command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT",
    "COOMV_ALG",
    "CSRMV_ALG1",
    "CSRMV_ALG2",
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    0
  };
  /* Position in this table is the enum value: CSR_ALG1 (4) precedes COO_ALG4 (5) */
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT",
    "COO_ALG1",
    "COO_ALG2",
    "COO_ALG3",
    "CSR_ALG1",
    "COO_ALG4",
    "CSR_ALG2",
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    0
  };
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID", /* cusparse does not have enum 0! We created one */
    "ALG1",
    "ALG2",
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    0
  };
#endif
589ae82921SPaul Mullowney 
59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
877f756511SDominic Meiser 
88042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8957181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
9157181aedSStefano Zampini 
927e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
937e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
947e8381f9SStefano Zampini 
95c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
96c215019aSStefano Zampini 
/*
  MatCUSPARSESetStream - Stores a CUDA stream in the matrix's cuSPARSE data and sets it
  on the cuSPARSE handle kept there.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
- stream - the CUDA stream to record and pass to cusparseSetStream()

  Raises PETSC_ERR_COR if A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!spdata) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  spdata->stream = stream;
  /* spdata->stream was just set to stream, so pass the stored value on to the handle */
  stat = cusparseSetStream(spdata->handle,spdata->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
108b06137fdSPaul Mullowney 
/*
  MatCUSPARSESetHandle - Makes the matrix use the given cuSPARSE handle.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
- handle - the cuSPARSE handle to store

  If the matrix already holds a different, non-null handle, that handle is destroyed
  before the new one is stored. The stored handle is then put in device pointer mode.
  Raises PETSC_ERR_COR if A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!spdata) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (spdata->handle != handle) {
    /* release the handle currently held, if any, before adopting the new one */
    if (spdata->handle) {stat = cusparseDestroy(spdata->handle);CHKERRCUSPARSE(stat);}
    spdata->handle = handle;
  }
  stat = cusparseSetPointerMode(spdata->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
125b06137fdSPaul Mullowney 
/*
  MatCUSPARSEClearHandle - Forgets the cuSPARSE handle stored in the matrix.

  Input Parameter:
. A - the matrix

  Does nothing unless A is of type MATSEQAIJCUSPARSE and A->spptr is non-NULL.
  The handle is only set to 0 here; cusparseDestroy() is not called on it.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqcusparse);CHKERRQ(ierr);
  if (isseqcusparse && spdata) {
    if (spdata->handle) spdata->handle = 0;
  }
  PetscFunctionReturn(0);
}
138b06137fdSPaul Mullowney 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

  Input Parameter:
. A - the factored matrix (not inspected)

  Output Parameter:
. type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  type[0] = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1459ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU, for matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1589ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix used by the
  "cusparse" solver for a matrix A.

  Input Parameters:
+ A     - the matrix to be factored
- ftype - the requested factorization (LU, ILU, ILUDT, Cholesky or ICC)

  Output Parameter:
. B - new MATSEQAIJCUSPARSE matrix of size n x n, with n = A->rmap->n

  The symbolic factorization routines installed in B are the CUSPARSE ones unless A is
  bound to the CPU, in which case the plain SeqAIJ routines are used. Any other
  factor type raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt n = A->rmap->n;
  Mat            F;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B;
  ierr = MatSetSizes(F,n,n,n,n);CHKERRQ(ierr);
  F->factortype = ftype;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* the factor follows A to the CPU only when A asks for its binding to propagate */
  if (A->boundtocpu && A->bindingpropagates) {
    ierr = MatBindToCPU(F,PETSC_TRUE);CHKERRQ(ierr);
  }

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
    if (A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  F->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2009ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Stores the requested GPU storage format in the
  matrix's cuSPARSE data.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
. op     - the operation the format applies to; MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL
           are accepted and both set the same (single) format field
- format - the storage format to record

  Raises PETSC_ERR_COR if A->spptr is NULL (same check as MatCUSPARSESetStream() and
  MatCUSPARSESetHandle()) and PETSC_ERR_SUP for any other value of op.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* fail with a PETSc error instead of dereferencing a NULL pointer below */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  switch (op) {
  case MAT_CUSPARSE_MULT:
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    /* cast the enum explicitly so the argument matches the %d conversion */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",(int)op);
  }
  PetscFunctionReturn(0);
}
2189ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

   Notes:
   The request is forwarded to the implementation registered on A under the name
   "MatCUSPARSESetFormat_C".

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific setter composed on the matrix */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
245e057df02SPaul Mullowney 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Records in the matrix's cuSPARSE data whether
  the CPU solve should be used.

  Input Parameters:
+ A       - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE
- use_cpu - value stored in the use_cpu_solve flag

  Raises PETSC_ERR_COR if A->spptr is NULL (same check as MatCUSPARSESetStream() and
  MatCUSPARSESetHandle()).
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* fail with a PETSc error instead of dereferencing a NULL pointer below */
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
254365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Sets whether the built-in CPU MatSolve is used.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - flag selecting the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this routine to specify whether the solve is done on the CPU or the GPU (GPU is the default).

   The request is forwarded to the implementation registered on A under the name
   "MatCUSPARSESetUseCPUSolve_C".

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific setter composed on the matrix */
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
282365b711fSMark Adams 
/*
  MatSetOption_SeqAIJCUSPARSE - Option handler for SEQAIJCUSPARSE matrices.

  Input Parameters:
+ A   - the matrix
. op  - the option being set
- flg - the new value of the option

  MAT_FORM_EXPLICIT_TRANSPOSE is handled here; every other option is passed on to
  MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option is being switched off, destroy the transpose matrix if present, to
       prevent logic errors if flg is set to true again later */
    if (A->form_explicit_transpose && !flg) {
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    }
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
300e6e9a74fSStefano Zampini 
301bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
302bddcd29dSMark Adams 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization for the cusparse solver.

  Input Parameters:
+ B    - the factor matrix (result of the symbolic factorization)
. A    - the matrix to factor
- info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

  The values of A are first brought back from the GPU and the factorization itself is
  done by MatLUFactorNumeric_SeqAIJ(), so B is marked as living on the CPU. Unless the
  use_cpu_solve flag is set, the CUSPARSE solve routines are then installed (the
  natural-ordering variants when both permutations are the identity) and the triangular
  factors are analyzed and copied to the GPU. matsolve/matsolvetranspose are always cleared.

  NOTE(review): B->spptr is read here as Mat_SeqAIJCUSPARSE, while the symbolic
  factorization routines in this file read B->spptr as Mat_SeqAIJCUSPARSETriFactors.
  Confirm that use_cpu_solve is valid for a factor matrix.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *b = (Mat_SeqAIJ*)B->data;
  IS                 isrow = b->row,iscol = b->col;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscBool          row_identity,col_identity;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!cusparsestruct->use_cpu_solve) {
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
  }
  /* cleared regardless of the ordering and of where the solve runs */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) {
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
340bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the options database entries of a
  SEQAIJCUSPARSE matrix.

  Input Parameters:
+ PetscOptionsObject - the options context (the PetscOptionsXXX() macros rely on this name)
- A                  - the matrix; A->spptr is interpreted as Mat_SeqAIJCUSPARSE

  Options are only processed for non-factored matrices (A->factortype == MAT_FACTOR_NONE):
  -mat_cusparse_mult_storage_format, -mat_cusparse_storage_format, -mat_cusparse_use_cpu_solve
  and, with CUDA 11.0 or later, -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg and
  -mat_cusparse_csr2csc_alg.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format",
                            "sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",
                            MatCUSPARSEStorageFormats,
                            (PetscEnum)cusparsestruct->format,
                            (PetscEnum*)&fmt,
                            &set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for every operation */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format",
                            "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",
                            MatCUSPARSEStorageFormats,
                            (PetscEnum)cusparsestruct->format,
                            (PetscEnum*)&fmt,
                            &set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }

    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve",
                            "Use CPU (I)LU solve",
                            "MatCUSPARSESetUseCPUSolve",
                            cusparsestruct->use_cpu_solve,
                            &cusparsestruct->use_cpu_solve,
                            &set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg",
                            "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",
                            MatCUSPARSESpMVAlgorithms,
                            (PetscEnum)cusparsestruct->spmvAlg,
                            (PetscEnum*)&cusparsestruct->spmvAlg,
                            &set);CHKERRQ(ierr);
    /* If the user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum()
       sets enum values based on their position in MatCUSPARSESpMVAlgorithms[]. The enumerator
       checked depends on the CUDA version. */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    if (set && CUSPARSE_SPMV_CSR_ALG1 != 2)
#else
    if (set && CUSPARSE_CSRMV_ALG1 != 2)
#endif
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg",
                            "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",
                            MatCUSPARSESpMMAlgorithms,
                            (PetscEnum)cusparsestruct->spmmAlg,
                            (PetscEnum*)&cusparsestruct->spmmAlg,
                            &set);CHKERRQ(ierr);
    if (set && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg",
                            "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",
                            MatCUSPARSECsr2CscAlgorithms,
                            (PetscEnum)cusparsestruct->csr2cscAlg,
                            (PetscEnum*)&cusparsestruct->csr2cscAlg,
                            &set);CHKERRQ(ierr);
    if (set && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3819ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for the cusparse solver.

  Input Parameters:
+ B            - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A            - the matrix to factor
. isrow, iscol - row and column permutations
- info         - factorization options

  Resets the triangular-factor data held by B, runs MatILUFactorSymbolic_SeqAIJ() and
  installs MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3939ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for the cusparse solver.

  Input Parameters:
+ B            - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A            - the matrix to factor
. isrow, iscol - row and column permutations
- info         - factorization options

  Resets the triangular-factor data held by B, runs MatLUFactorSymbolic_SeqAIJ() and
  installs MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
4059ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for the cusparse solver.

  Input Parameters:
+ B    - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix to factor
. perm - the permutation
- info - factorization options

  Resets the triangular-factor data held by B, runs MatICCFactorSymbolic_SeqAIJ() and
  installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
417087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for the
  cusparse solver.

  Input Parameters:
+ B    - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix to factor
. perm - the permutation
- info - factorization options

  Resets the triangular-factor data held by B, runs MatCholeskyFactorSymbolic_SeqAIJ() and
  installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
429087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds, or refreshes the values of, the lower triangular
  factor stored in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr from the host data in A->data.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no rows, and does no work unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  First call (loTriFactorPtr is NULL): CSR row offsets, column indices and values are assembled in
  pinned host buffers, with an explicit 1.0 stored as the last (diagonal) entry of every row. They are
  then assigned to the thrust arrays of a new CsrMatrix, a cuSPARSE matrix descriptor is created
  (lower fill mode, unit diagonal) and the triangular solve analysis is run. The values buffer is
  kept in loTriFactor->AA_h; the two index buffers are freed.

  Later calls (loTriFactorPtr already set): only the values are regenerated into AA_h and assigned
  to csrMat->values again.

  A->spptr is dereferenced unconditionally in the declarations, so the caller must ensure it is non-NULL.
*/
430087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
4319ae82921SPaul Mullowney {
4329ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4339ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4349ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
435aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
4369ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4379ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
4389ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
4399ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
4409ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
441b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
44257d48284SJunchao Zhang   cudaError_t                       cerr;
4439ae82921SPaul Mullowney 
4449ae82921SPaul Mullowney   PetscFunctionBegin;
445cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
446c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4479ae82921SPaul Mullowney     try {
4489ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n diagonal entries plus the stored entries of rows 1..n-1 (ai[n]-ai[1]).
         NOTE(review): the fill loops below start reading at aa[0]/aj[0] while this count starts at ai[1];
         the two agree only if ai[1] == 0, i.e. presumably row 0 of the stored factor is empty - confirm */
4499ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
450da79fbbcSStefano Zampini       if (!loTriFactor) {
4512cbc15d9SMark         PetscScalar                       *AALo;
4522cbc15d9SMark 
        /* pinned host buffer for the values; ownership is handed to loTriFactor->AA_h further down */
4532cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4549ae82921SPaul Mullowney 
4559ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
        /* NOTE(review): the CHK* macros used from here on presumably return on failure, in which case
           these pinned buffers are not released - confirm whether that matters for callers */
45657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
45757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
4589ae82921SPaul Mullowney 
4599ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
        /* row 0 consists of its unit diagonal entry only; rows 1..n-1 are appended by the loop */
4609ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
4619ae82921SPaul Mullowney         AiLo[n]  = nzLower;
4629ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
4639ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4649ae82921SPaul Mullowney         v        = aa;
4659ae82921SPaul Mullowney         vi       = aj;
4669ae82921SPaul Mullowney         offset   = 1;
4679ae82921SPaul Mullowney         rowOffset= 1;
4689ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4699ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
470e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4719ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4729ae82921SPaul Mullowney           rowOffset += nz+1;
4739ae82921SPaul Mullowney 
          /* copy the nz stored entries of row i, then place the unit diagonal right after them */
474580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
475580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
4769ae82921SPaul Mullowney 
4779ae82921SPaul Mullowney           offset      += nz;
4789ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4799ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4809ae82921SPaul Mullowney           offset      += 1;
4819ae82921SPaul Mullowney 
          /* advance the read cursors to the next row of a->a / a->j */
4829ae82921SPaul Mullowney           v  += nz;
4839ae82921SPaul Mullowney           vi += nz;
4849ae82921SPaul Mullowney         }
4852205254eSKarl Rupp 
486aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
487da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
488da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
489aa372e3fSPaul Mullowney         /* Create the matrix description */
49057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
49157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* the descriptor type is GENERAL when built against CUDA >= 9.0 and TRIANGULAR otherwise */
4921b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
493afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
494afb2bd1cSJunchao Zhang        #else
49557d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
496afb2bd1cSJunchao Zhang        #endif
49757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
49857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
499aa372e3fSPaul Mullowney 
500aa372e3fSPaul Mullowney         /* set the operation */
501aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
502aa372e3fSPaul Mullowney 
503aa372e3fSPaul Mullowney         /* set the matrix */
        /* the three host arrays built above are copied into freshly allocated thrust arrays
           (presumably device-resident; the transfer is accounted for by PetscLogCpuToGpu below) */
504aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
505aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
506aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
507aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
508aa372e3fSPaul Mullowney 
509aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
510aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
511aa372e3fSPaul Mullowney 
512aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
513aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
514aa372e3fSPaul Mullowney 
515aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
516aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
517aa372e3fSPaul Mullowney 
518afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
519da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
520afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
        /* with CUDA >= 9.0 the required work buffer size is queried first and the buffer allocated with cudaMalloc */
5211b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
522afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
523afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
524afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
525afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
526afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
527afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
528afb2bd1cSJunchao Zhang       #endif
529afb2bd1cSJunchao Zhang 
530aa372e3fSPaul Mullowney         /* perform the solve analysis */
531aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
532aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
533aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
534d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
5351b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
536d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
537d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
538d49cd2b7SBarry Smith                                #else
539d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
540afb2bd1cSJunchao Zhang                                #endif
        /* presumably waits for the GPU so that the analysis is complete before the log event is closed - confirm */
541da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
542da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
543aa372e3fSPaul Mullowney 
544da79fbbcSStefano Zampini         /* assign the pointer */
545aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned values buffer for the update-only path; the index buffers are no longer needed */
5462cbc15d9SMark         loTriFactor->AA_h = AALo;
54757d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
54857d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
5494863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
550da79fbbcSStefano Zampini       } else { /* update values only */
        /* the pinned values buffer is allocated here if the existing factor does not have one */
5512cbc15d9SMark         if (!loTriFactor->AA_h) {
5522cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
5532cbc15d9SMark         }
554da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
        /* same value layout as in the first-call branch; the index arrays are left untouched.
           NOTE(review): vi is assigned below but never read in this branch */
5552cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
556da79fbbcSStefano Zampini         v        = aa;
557da79fbbcSStefano Zampini         vi       = aj;
558da79fbbcSStefano Zampini         offset   = 1;
559da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
560da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
5612cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
562da79fbbcSStefano Zampini           offset      += nz;
5632cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
564da79fbbcSStefano Zampini           offset      += 1;
565da79fbbcSStefano Zampini           v  += nz;
566da79fbbcSStefano Zampini         }
5672cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
568da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
569da79fbbcSStefano Zampini       }
    /* NOTE(review): this handler only matches exceptions thrown as char*; an exception of any other
       type raised inside the try block (e.g. by operator new or the thrust containers) is not caught here */
5709ae82921SPaul Mullowney     } catch(char *ex) {
5719ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5729ae82921SPaul Mullowney     }
5739ae82921SPaul Mullowney   }
5749ae82921SPaul Mullowney   PetscFunctionReturn(0);
5759ae82921SPaul Mullowney }
5769ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds, or refreshes the values of, the upper triangular
  factor stored in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr from the host data in A->data.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has no rows, and does no work unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Rows are located through a->diag: for row i the off-diagonal entries start at adiag[i+1]+1 and the
  diagonal entry sits at adiag[i], so adiag decreases with the row index and the total entry count is
  adiag[0]-adiag[n]. The output CSR arrays are filled from the last row to the first, with the
  diagonal entry placed first in each row and written as the reciprocal of the stored value.

  First call (upTriFactorPtr is NULL): row offsets, column indices and values are assembled in pinned
  host buffers, assigned to the thrust arrays of a new CsrMatrix, a cuSPARSE matrix descriptor is
  created (upper fill mode, non-unit diagonal) and the triangular solve analysis is run. The values
  buffer is kept in upTriFactor->AA_h; the two index buffers are freed.

  Later calls (upTriFactorPtr already set): only the values are regenerated into AA_h and assigned
  to csrMat->values again.

  A->spptr is dereferenced unconditionally in the declarations, so the caller must ensure it is non-NULL.
*/
577087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
5789ae82921SPaul Mullowney {
5799ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
5809ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
5819ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
582aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
5839ae82921SPaul Mullowney   cusparseStatus_t                  stat;
5849ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
5859ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
5869ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
5879ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
5889ae82921SPaul Mullowney   PetscErrorCode                    ierr;
58957d48284SJunchao Zhang   cudaError_t                       cerr;
5909ae82921SPaul Mullowney 
5919ae82921SPaul Mullowney   PetscFunctionBegin;
592cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
593c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5949ae82921SPaul Mullowney     try {
5959ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
      /* equals the sum over all rows of (nz+1) as computed in the fill loops below */
5969ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
597da79fbbcSStefano Zampini       if (!upTriFactor) {
5982cbc15d9SMark         PetscScalar *AAUp;
5992cbc15d9SMark 
        /* pinned host buffer for the values; ownership is handed to upTriFactor->AA_h further down */
6002cbc15d9SMark         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
6012cbc15d9SMark 
6029ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
        /* NOTE(review): the CHK* macros used from here on presumably return on failure, in which case
           these pinned buffers are not released - confirm whether that matters for callers */
60357d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
60457d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
6059ae82921SPaul Mullowney 
6069ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
        /* rows are visited from n-1 down to 0 and written from the end of the arrays towards the front */
6079ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
6089ae82921SPaul Mullowney         AiUp[n]=nzUpper;
6099ae82921SPaul Mullowney         offset = nzUpper;
6109ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
6119ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
6129ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
6139ae82921SPaul Mullowney 
614e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
6159ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
6169ae82921SPaul Mullowney 
617e057df02SPaul Mullowney           /* decrement the offset */
6189ae82921SPaul Mullowney           offset -= (nz+1);
6199ae82921SPaul Mullowney 
620e057df02SPaul Mullowney           /* first, set the diagonal elements */
          /* v[nz] is aa[adiag[i]]; its reciprocal is what gets stored as the diagonal value
             (presumably because the host factorization keeps the diagonal inverted - confirm) */
6219ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
62209f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
6239ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
6249ae82921SPaul Mullowney 
          /* the nz off-diagonal entries follow the diagonal within the row */
625580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
626580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
6279ae82921SPaul Mullowney         }
6282205254eSKarl Rupp 
629aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
630da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
631da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
6322205254eSKarl Rupp 
633aa372e3fSPaul Mullowney         /* Create the matrix description */
63457d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
63557d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* the descriptor type is GENERAL when built against CUDA >= 9.0 and TRIANGULAR otherwise */
6361b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
637afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
638afb2bd1cSJunchao Zhang        #else
63957d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
640afb2bd1cSJunchao Zhang        #endif
64157d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
64257d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
643aa372e3fSPaul Mullowney 
644aa372e3fSPaul Mullowney         /* set the operation */
645aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
646aa372e3fSPaul Mullowney 
647aa372e3fSPaul Mullowney         /* set the matrix */
        /* the three host arrays built above are copied into freshly allocated thrust arrays
           (presumably device-resident; the transfer is accounted for by PetscLogCpuToGpu below) */
648aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
649aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
650aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
651aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
652aa372e3fSPaul Mullowney 
653aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
654aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
655aa372e3fSPaul Mullowney 
656aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
657aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
658aa372e3fSPaul Mullowney 
659aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
660aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
661aa372e3fSPaul Mullowney 
662afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
663da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
664afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
        /* with CUDA >= 9.0 the required work buffer size is queried first and the buffer allocated with cudaMalloc */
6651b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
666afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
667afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
668afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
669afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
670afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
671afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
672afb2bd1cSJunchao Zhang       #endif
673afb2bd1cSJunchao Zhang 
674aa372e3fSPaul Mullowney         /* perform the solve analysis */
675aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
676aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
677aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
678d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
6791b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
680d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
681d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
682d49cd2b7SBarry Smith                                #else
683d49cd2b7SBarry Smith                                  upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
684afb2bd1cSJunchao Zhang                                #endif
        /* presumably waits for the GPU so that the analysis is complete before the log event is closed - confirm */
685da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
686da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
687aa372e3fSPaul Mullowney 
688da79fbbcSStefano Zampini         /* assign the pointer */
689aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned values buffer for the update-only path; the index buffers are no longer needed */
6902cbc15d9SMark         upTriFactor->AA_h = AAUp;
69157d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
69257d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
6934863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
694da79fbbcSStefano Zampini       } else {
        /* update values only: the pinned values buffer is allocated here if the existing factor does not have one */
6952cbc15d9SMark         if (!upTriFactor->AA_h) {
6962cbc15d9SMark           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
6972cbc15d9SMark         }
698da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
        /* same value layout as in the first-call branch; the index arrays are left untouched */
699da79fbbcSStefano Zampini         offset = nzUpper;
700da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
701da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
702da79fbbcSStefano Zampini 
703da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
704da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
705da79fbbcSStefano Zampini 
706da79fbbcSStefano Zampini           /* decrement the offset */
707da79fbbcSStefano Zampini           offset -= (nz+1);
708da79fbbcSStefano Zampini 
709da79fbbcSStefano Zampini           /* first, set the diagonal elements */
7102cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
7112cbc15d9SMark           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
712da79fbbcSStefano Zampini         }
7132cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
714da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
715da79fbbcSStefano Zampini       }
    /* NOTE(review): this handler only matches exceptions thrown as char*; an exception of any other
       type raised inside the try block (e.g. by operator new or the thrust containers) is not caught here */
7169ae82921SPaul Mullowney     } catch(char *ex) {
7179ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
7189ae82921SPaul Mullowney     }
7199ae82921SPaul Mullowney   }
7209ae82921SPaul Mullowney   PetscFunctionReturn(0);
7219ae82921SPaul Mullowney }
7229ae82921SPaul Mullowney 
723087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
7249ae82921SPaul Mullowney {
7259ae82921SPaul Mullowney   PetscErrorCode               ierr;
7269ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
7279ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
7289ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
7299ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
7309ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
7319ae82921SPaul Mullowney 
7329ae82921SPaul Mullowney   PetscFunctionBegin;
733da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
734087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
735087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
7362205254eSKarl Rupp 
737da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
738aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
7399ae82921SPaul Mullowney 
740c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
741e057df02SPaul Mullowney   /* lower triangular indices */
7429ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
743da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
744da79fbbcSStefano Zampini     const PetscInt *r;
745da79fbbcSStefano Zampini 
746da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
747aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
748aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
7499ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
750da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
751da79fbbcSStefano Zampini   }
7529ae82921SPaul Mullowney 
753e057df02SPaul Mullowney   /* upper triangular indices */
7549ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
755da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
756da79fbbcSStefano Zampini     const PetscInt *c;
757da79fbbcSStefano Zampini 
758da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
759aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
760aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
7619ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
762da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
763da79fbbcSStefano Zampini   }
7649ae82921SPaul Mullowney   PetscFunctionReturn(0);
7659ae82921SPaul Mullowney }
7669ae82921SPaul Mullowney 
767087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
768087f3262SPaul Mullowney {
769087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
770087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
771aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
772aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
773087f3262SPaul Mullowney   cusparseStatus_t                  stat;
774087f3262SPaul Mullowney   PetscErrorCode                    ierr;
77557d48284SJunchao Zhang   cudaError_t                       cerr;
776087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
777087f3262SPaul Mullowney   PetscScalar                       *AAUp;
778087f3262SPaul Mullowney   PetscScalar                       *AALo;
779087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
780087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
781087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
782087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
783087f3262SPaul Mullowney 
784087f3262SPaul Mullowney   PetscFunctionBegin;
785cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
786c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
787087f3262SPaul Mullowney     try {
788da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
789da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
790da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
791087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
79257d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
79357d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
794087f3262SPaul Mullowney 
795087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
796087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
797087f3262SPaul Mullowney         AiUp[n]=nzUpper;
798087f3262SPaul Mullowney         offset = 0;
799087f3262SPaul Mullowney         for (i=0; i<n; i++) {
800087f3262SPaul Mullowney           /* set the pointers */
801087f3262SPaul Mullowney           v  = aa + ai[i];
802087f3262SPaul Mullowney           vj = aj + ai[i];
803087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
804087f3262SPaul Mullowney 
805087f3262SPaul Mullowney           /* first, set the diagonal elements */
806087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
80709f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
808087f3262SPaul Mullowney           AiUp[i]      = offset;
80909f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
810087f3262SPaul Mullowney 
811087f3262SPaul Mullowney           offset+=1;
812087f3262SPaul Mullowney           if (nz>0) {
813f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
814580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
815087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
816087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
817087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
818087f3262SPaul Mullowney             }
819087f3262SPaul Mullowney             offset+=nz;
820087f3262SPaul Mullowney           }
821087f3262SPaul Mullowney         }
822087f3262SPaul Mullowney 
823aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
824da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
825da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
826087f3262SPaul Mullowney 
827aa372e3fSPaul Mullowney         /* Create the matrix description */
82857d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
82957d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8301b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
831afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
832afb2bd1cSJunchao Zhang        #else
83357d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
834afb2bd1cSJunchao Zhang        #endif
83557d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83657d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
837087f3262SPaul Mullowney 
838aa372e3fSPaul Mullowney         /* set the matrix */
839aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
840aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
841aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
842aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
843aa372e3fSPaul Mullowney 
844aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
845aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
848aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
849aa372e3fSPaul Mullowney 
850aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
851aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
852aa372e3fSPaul Mullowney 
853afb2bd1cSJunchao Zhang         /* set the operation */
854afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
855afb2bd1cSJunchao Zhang 
856afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
857da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
858afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8591b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
860afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
861afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
862afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
863afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
864afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
865afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
866afb2bd1cSJunchao Zhang       #endif
867afb2bd1cSJunchao Zhang 
868aa372e3fSPaul Mullowney         /* perform the solve analysis */
869aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
870aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
871aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
872d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8731b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
874d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
875d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
876d49cd2b7SBarry Smith                                 #else
877d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
878afb2bd1cSJunchao Zhang                                 #endif
879da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
880da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
881aa372e3fSPaul Mullowney 
882da79fbbcSStefano Zampini         /* assign the pointer */
883aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
884aa372e3fSPaul Mullowney 
885aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
886da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
887da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
888aa372e3fSPaul Mullowney 
889aa372e3fSPaul Mullowney         /* Create the matrix description */
89057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
89157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8921b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
893afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
894afb2bd1cSJunchao Zhang        #else
89557d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
896afb2bd1cSJunchao Zhang        #endif
89757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
89857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
899aa372e3fSPaul Mullowney 
900aa372e3fSPaul Mullowney         /* set the operation */
901aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
902aa372e3fSPaul Mullowney 
903aa372e3fSPaul Mullowney         /* set the matrix */
904aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
905aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
906aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
907aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
908aa372e3fSPaul Mullowney 
909aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
910aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
911aa372e3fSPaul Mullowney 
912aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
913aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
914aa372e3fSPaul Mullowney 
915aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
916aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
917aa372e3fSPaul Mullowney 
918afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
919da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
920afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
9211b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
922afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
923afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
924afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
925afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
926afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
927afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
928afb2bd1cSJunchao Zhang       #endif
929afb2bd1cSJunchao Zhang 
930aa372e3fSPaul Mullowney         /* perform the solve analysis */
931aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
932aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
933aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
934d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
9351b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
936d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
937d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
938d49cd2b7SBarry Smith                                 #else
939d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
940afb2bd1cSJunchao Zhang                                 #endif
941da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
942da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
943aa372e3fSPaul Mullowney 
944da79fbbcSStefano Zampini         /* assign the pointer */
945aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
946087f3262SPaul Mullowney 
947da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
94857d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
94957d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
950da79fbbcSStefano Zampini       } else {
951da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
952da79fbbcSStefano Zampini         offset = 0;
953da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
954da79fbbcSStefano Zampini           /* set the pointers */
955da79fbbcSStefano Zampini           v  = aa + ai[i];
956da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
957da79fbbcSStefano Zampini 
958da79fbbcSStefano Zampini           /* first, set the diagonal elements */
959da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
960da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
961da79fbbcSStefano Zampini 
962da79fbbcSStefano Zampini           offset+=1;
963da79fbbcSStefano Zampini           if (nz>0) {
964da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
965da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
966da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
967da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
968da79fbbcSStefano Zampini             }
969da79fbbcSStefano Zampini             offset+=nz;
970da79fbbcSStefano Zampini           }
971da79fbbcSStefano Zampini         }
972da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
973da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
974da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
975da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
976da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
977da79fbbcSStefano Zampini       }
97857d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
97957d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
980087f3262SPaul Mullowney     } catch(char *ex) {
981087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
982087f3262SPaul Mullowney     }
983087f3262SPaul Mullowney   }
984087f3262SPaul Mullowney   PetscFunctionReturn(0);
985087f3262SPaul Mullowney }
986087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors of A on the GPU and,
  when the row ordering is not the identity, uploads the permutation index arrays.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Calls MatSeqAIJCUSPARSEBuildICCTriMatrices(), allocates the work vector on first use, records
  nnz = 2*(nz-n) + n and marks the matrix as PETSC_OFFLOAD_BOTH.
  For a non-identity ordering, rpermIndices receives the indices of a->row and cpermIndices the
  indices of its inverse permutation.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij      = (Mat_SeqAIJ*)A->data;
  PetscInt                     nrows     = A->rmap->n;
  IS                           ordering  = aij->row;
  PetscBool                    isNatural;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) the triangular factors on the device */
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* the work vector is created only once and then reused */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz   = (aij->nz-nrows)*2 + nrows;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* nothing more to do when the ordering is the identity */
  ierr = ISIdentity(ordering,&isNatural);CHKERRQ(ierr);
  if (isNatural) {
    PetscFunctionReturn(0);
  }

  {
    IS             inverse;
    const PetscInt *inverseIdx,*forwardIdx;

    /* host-side index arrays of the ordering and of its inverse */
    ierr = ISInvertPermutation(ordering,PETSC_DECIDE,&inverse);CHKERRQ(ierr);
    ierr = ISGetIndices(inverse,&inverseIdx);CHKERRQ(ierr);
    ierr = ISGetIndices(ordering,&forwardIdx);CHKERRQ(ierr);

    /* copy both index arrays into thrust arrays */
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(forwardIdx, forwardIdx+nrows);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(inverseIdx, inverseIdx+nrows);

    ierr = ISRestoreIndices(inverse,&inverseIdx);CHKERRQ(ierr);
    ierr = ISDestroy(&inverse);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ordering,&forwardIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1024087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  Input Parameters:
+ B    - the factor matrix (filled in by this routine)
. A    - the matrix to factor
- info - factorization options, forwarded to MatCholeskyFactorNumeric_SeqAIJ()

  Notes:
  The values of A are first brought back from the GPU, the factorization itself is delegated to
  MatCholeskyFactorNumeric_SeqAIJ(), and the resulting factors are then analysed and copied to
  the GPU by MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU().
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *factor   = (Mat_SeqAIJ*)B->data;
  IS             ordering  = factor->row;
  PetscBool      isNatural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variants that match the ordering */
  ierr = ISIdentity(ordering,&isNatural);CHKERRQ(ierr);
  B->ops->solve             = isNatural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose    = isNatural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no matsolve variants are installed for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10549ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor - builds the explicit transpose of one triangular
  factor and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix (used only as the object attached to the log events)
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the triangular factor to transpose

  Output Parameter:
. triFactorTranspose - newly allocated structure holding the transposed factor

  Notes:
  The new descriptor copies matrix type, index base and diagonal type from triFactor and uses the
  opposite fill mode. The transposed values/indices are produced by cusparse_csr2csc(), and the
  result is solved with CUSPARSE_OPERATION_NON_TRANSPOSE.
  With CUDA >= 11 the csr2csc work buffer is allocated here and stored on the source factor
  (triFactor->csr2cscBuffer), as in the original code.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTranspose)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the source factor; the fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  /* wait for the device before stopping the timer */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (the original code called PetscLogEventBegin() a second time here) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           triFactorT->solveInfo,
                           triFactorT->solvePolicy, triFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           triFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTranspose = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds the transposes of the lower and upper triangular
  factors stored in A->spptr and performs the cuSPARSE solve analysis on each of them.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose loTriFactorPtr
      and upTriFactorPtr are set

  Notes:
  On success loTriFactorPtrTranspose and upTriFactorPtrTranspose are set. The lower factor is
  processed first, then the upper one. A missing container or factor raises PETSC_ERR_COR.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT,*upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1260bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, that converts a PetscScalar to a
   PetscInt by casting the scalar's real part (C-style narrowing cast). */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    const PetscReal realPart = PetscRealPart(s);
    return static_cast<PetscInt>(realPart);
  }
};
1269a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds or refreshes the explicit GPU transpose of A
  that is stored in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; its data is first pushed to the GPU with MatSeqAIJCUSPARSECopyToGPU()

  Behavior:
  - Returns immediately when A->transupdated is already set (erroring out if the flag is set
    but no transpose struct exists).
  - For formats other than MAT_CUSPARSE_CSR the existing transpose is invalidated and rebuilt
    from scratch (HYB -> CSR -> CSC -> HYB); this path only exists for CUDA older than 11.0.
  - For MAT_CUSPARSE_CSR the sparsity pattern of the transpose and the permutation
    cusparsestruct->csr2csc_i are computed once with csr2csc; the values are then refreshed
    with a permuted copy on every call.
  - On success, sets cusparsestruct->matTranspose and A->transupdated = PETSC_TRUE.

  Returns 0 on success, otherwise a PETSc error code.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR transposes cannot be updated in place: drop the old one so it is rebuilt below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of the transpose: dimensions are swapped, nnz is unchanged */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* upload the (uncompressed) row offsets of A; they are the csr2csc input used below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
      /* log the transfer, as done for the same upload in the update branch further down */
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns: csr2csc writes
         (number of columns of A)+1 column pointers into tempT->row_offsets, so that
         array must be sized with A->cmap->n+1 (not A->rmap->n+1) to avoid an overflow
         for rectangular matrices. */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB; the CSR data handed over describes the transpose,
         hence the transposed dimensions */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer; A->transupdated is only set at the end of the function,
         once cusparsestruct->matTranspose has actually been assigned */
      matstructT->mat = hybMat;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Build the CSR->CSC permutation once: transpose a "values" array holding 0,1,2,...
         so that the transposed values are the source positions of each entry.
         NOTE(review): the indices are round-tripped through PetscScalar; presumably this is
         exact only while nnz is representable in the scalar type - confirm for single precision. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the scalar-encoded source positions into the integer permutation */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* refresh the transposed values: matrixT->values[k] = matrix->values[csr2csc_i[k]] */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1499bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose triangular solve on the GPU using the
  transposed factors stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Steps: gather bb through rpermIndices into xx, solve with the transposed upper factor
  (xx -> work vector), solve with the transposed lower factor (work vector -> xx), and
  finally gather xx through cpermIndices via the work vector.

  Open question kept from the original authors: why do we need to analyze the transposed
  matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Neither transposed factor exists yet: build and analyze them now, then re-read the pointers */
  if (!(loTriFactorT || upTriFactorT)) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Obtain the device arrays of both vectors and wrap them for thrust */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: x <- b reordered with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Step 2: solve with the transposed U factor, reading x and writing the work vector */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        upTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Step 3: solve with the transposed L factor, reading the work vector and writing x */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        loTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Step 4: apply the column permutation; this cannot be done in place, so gather x into the work vector ... */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* ... and then copy the work vector back into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* Hand the arrays back and record timing and flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1588bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose triangular solve on the GPU
  without any row/column permutation of the vectors.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Steps: solve with the transposed upper factor (bb -> work vector), then solve with the
  transposed lower factor (work vector -> xx).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Neither transposed factor exists yet: build and analyze them now, then re-read the pointers */
  if (!(loTriFactorT || upTriFactorT)) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Obtain the device arrays of both vectors */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: solve with the transposed U factor, reading b and writing the work vector */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        upTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
#endif

  /* Step 2: solve with the transposed L factor, reading the work vector and writing x */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        loTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray);CHKERRCUSPARSE(stat);
#endif

  /* Hand the arrays back and record timing and flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1658bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - triangular solves with the factors stored in A->spptr.

  Steps performed, in order:
    1. gather bb through rpermIndices into the work vector,
    2. solve with the lower factor (work vector -> xx array),
    3. solve with the upper factor (xx array -> work vector),
    4. gather the work vector through cpermIndices into xx.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed through VecCUDAGetArrayWrite()

  Each cusparse_solve() call is written out in full for both CUDA version ranges
  instead of placing preprocessor conditionals inside one argument list.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                     *rhs;
  PetscScalar                           *sol;
  thrust::device_ptr<const PetscScalar> rhsGPU;
  thrust::device_ptr<PetscScalar>       solGPU;
  cusparseStatus_t                      stat;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Device pointers of the two vectors, raw and wrapped for thrust */
  ierr   = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr   = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  solGPU = thrust::device_pointer_cast(sol);
  rhsGPU = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: work = bb gathered through the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: lower triangular solve, work -> sol */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                        lower->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        work->data().get(),
                        sol,
                        lower->solvePolicy, lower->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        work->data().get(),
                        sol);CHKERRCUSPARSE(stat);
 #endif

  /* Step 3: upper triangular solve, sol -> work */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                        upper->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        sol,
                        work->data().get(),
                        upper->solvePolicy, upper->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        sol,
                        work->data().get());CHKERRCUSPARSE(stat);
 #endif

  /* Step 4: xx = work gathered through the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solGPU);

  /* Hand the arrays back, then close the timer and log the flop count */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17349ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - triangular solves without any permutation step.

  The lower factor is applied to bb with the result stored in the work vector,
  then the upper factor is applied to the work vector with the result stored in xx.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed through VecCUDAGetArrayWrite()

  Each cusparse_solve() call is written out in full for both CUDA version ranges
  instead of placing preprocessor conditionals inside one argument list.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Device pointers of the two vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Lower triangular solve, rhs -> work */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                        lower->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        rhs,
                        work->data().get(),
                        lower->solvePolicy,lower->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        rhs,
                        work->data().get());CHKERRCUSPARSE(stat);
 #endif

  /* Upper triangular solve, work -> sol */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                        upper->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(),
                        sol,
                        upper->solvePolicy, upper->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(),
                        sol);CHKERRCUSPARSE(stat);
 #endif

  /* Hand the arrays back, then close the timer and log the flop count */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17969ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - copies the matrix values from the device CSR storage
  into the host array a->a.

  The copy happens only when A->offloadmask is PETSC_OFFLOAD_GPU; afterwards the mask is
  set to PETSC_OFFLOAD_BOTH. Only the values array (a->nz scalars) is transferred.

  NOTE(review): cusp->mat->mat is cast to CsrMatrix without looking at the storage
  format - presumably callers only reach this with CSR storage; confirm.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  size_t             nbytes;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Anything other than a GPU-only mask means there is nothing to bring back */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) {
    PetscFunctionReturn(0);
  }

  csr    = (CsrMatrix*)cusp->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
18177e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - returns the host values array of A for read/write use.

  MatSeqAIJCUSPARSECopyFromGPU() is called first, then *array is set to the
  host array held in the Mat_SeqAIJ data.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
182767a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - ends read/write access to the host values array.

  Clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
183567a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - returns the host values array of A for read-only use.

  MatSeqAIJCUSPARSECopyFromGPU() is called first, then *array is set to the
  host array held in the Mat_SeqAIJ data.
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
184567a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - ends read-only access to the host values array.

  Only clears the caller's pointer; the matrix A is not modified.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
185267a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - returns the host values array of A for write-only use.

  No device-to-host copy is made here; *array is simply set to the host array
  held in the Mat_SeqAIJ data.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
185967a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - ends write-only access to the host values array.

  Clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
18677e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - brings the device representation of A up to date with the host data.

  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  Two paths exist:
    - values only: taken when A->nonzerostate matches the state recorded in the
      Mat_SeqAIJCUSPARSE struct and the format is MAT_CUSPARSE_CSR; the existing device
      values array is overwritten with a->a.
    - full rebuild: the old mult struct, work vector and row offsets are destroyed and a
      new structure is created from the host CSR arrays (or the compressed-row arrays
      when a->compressedrow.use is set). For ELL/HYB formats a temporary CSR matrix is
      converted with cusparse_csr2hyb(); with CUDA 11 or newer those formats raise
      PETSC_ERR_SUP instead.

  On return the offload mask is PETSC_OFFLOAD_BOTH, except when the rebuild path found
  no host values array (a->a == NULL), in which case the mask is left unchanged.

  Errors: PETSC_ERR_GPU if A is bound to the CPU or required CSR data is missing.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusp      = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *mult      = cusp->mat;
  Mat_SeqAIJ                   *aij       = (Mat_SeqAIJ*)A->data;
  PetscInt                     nrows      = A->rmap->n,*rowoff,*cprow,ncprow;
  PetscBool                    hasvalues  = PETSC_TRUE;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  /* Only the UNALLOCATED and CPU offload states require any work */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) {
    PetscFunctionReturn(0);
  }

  if (A->nonzerostate == cusp->nonzerostate && cusp->format == MAT_CUSPARSE_CSR) {
    /* Same nonzero state and CSR storage: refresh the values array only */
    CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

    if (aij->nz && !aij->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    csr->values->assign(aij->a, aij->a+aij->nz);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogCpuToGpu((aij->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    /* Otherwise: discard the device data and build it again from the host arrays */
    PetscInt nnz;

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    cusp->workVector     = NULL;
    cusp->rowoffsets_gpu = NULL;
    try {
      /* Pick the row description: compressed rows when in use, plain CSR otherwise */
      if (aij->compressedrow.use) {
        nrows  = aij->compressedrow.nrows;
        rowoff = aij->compressedrow.i;
        cprow  = aij->compressedrow.rindex;
      } else {
        nrows  = A->rmap->n;
        rowoff = aij->i;
        cprow  = NULL;
      }
      if (!rowoff) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
      if (nrows && !aij->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
      /* Without a host values array the entry count comes from the row offsets and
         the final offload mask must not be promoted to BOTH */
      if (aij->a) {
        nnz = aij->nz;
      } else {
        nnz       = rowoff[nrows];
        hasvalues = PETSC_FALSE;
      }

      /* Fresh mult struct with a general, zero-based matrix descriptor */
      cusp->nrows = nrows;
      mult        = new Mat_SeqAIJCUSPARSEMultStruct;
      stat = cusparseCreateMatDescr(&mult->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(mult->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatType(mult->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

      /* Scalar constants kept in device memory; the handle is switched to device pointer mode */
      cerr = cudaMalloc((void **)&(mult->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMalloc((void **)&(mult->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMalloc((void **)&(mult->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(mult->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(mult->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(mult->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      stat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

      if (cusp->format==MAT_CUSPARSE_CSR) {
        /* CSR storage: the device CSR matrix itself is what gets stored */
        CsrMatrix *csr = new CsrMatrix;

        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
        csr->row_offsets->assign(rowoff, rowoff + nrows+1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j+nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

        mult->mat = csr;
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        if (csr->num_rows) { /* cusparse errors on empty matrices! */
          stat = cusparseCreateCsr(&mult->matDescr,
                                   csr->num_rows, csr->num_cols, csr->num_entries,
                                   csr->row_offsets->data().get(), csr->column_indices->data().get(),
                                   csr->values->data().get(),
                                   CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                   CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
        }
       #endif
      } else if (cusp->format==MAT_CUSPARSE_ELL || cusp->format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        /* ELL/HYB storage: stage a temporary CSR matrix, convert it, then free it */
        CsrMatrix *csr = new CsrMatrix;

        csr->num_rows    = nrows;
        csr->num_cols    = A->cmap->n;
        csr->num_entries = nnz;

        csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
        csr->row_offsets->assign(rowoff, rowoff + nrows+1);

        csr->column_indices = new THRUSTINTARRAY32(nnz);
        csr->column_indices->assign(aij->j, aij->j+nnz);

        csr->values = new THRUSTARRAY(nnz);
        if (aij->a) csr->values->assign(aij->a, aij->a+nnz);

        cusparseHybMat_t hyb;
        stat = cusparseCreateHybMat(&hyb);CHKERRCUSPARSE(stat);
        cusparseHybPartition_t partition = cusp->format==MAT_CUSPARSE_ELL ?
          CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
        stat = cusparse_csr2hyb(cusp->handle, csr->num_rows, csr->num_cols,
            mult->descr, csr->values->data().get(),
            csr->row_offsets->data().get(),
            csr->column_indices->data().get(),
            hyb, 0, partition);CHKERRCUSPARSE(stat);
        mult->mat = hyb;

        /* The staging CSR matrix is no longer needed (delete on a null pointer is a no-op) */
        delete (THRUSTARRAY*)csr->values;
        delete (THRUSTINTARRAY32*)csr->column_indices;
        delete (THRUSTINTARRAY32*)csr->row_offsets;
        delete (CsrMatrix*)csr;
       #endif
      }

      /* Compressed-row bookkeeping: a work vector plus the device copy of the row indices */
      if (aij->compressedrow.use) {
        cusp->workVector   = new THRUSTARRAY(nrows);
        mult->cprowIndices = new THRUSTINTARRAY(nrows);
        mult->cprowIndices->assign(cprow,cprow+nrows);
        ncprow = nrows;
      } else {
        cusp->workVector   = NULL;
        mult->cprowIndices = NULL;
        ncprow = 0;
      }
      ierr = PetscLogCpuToGpu(((nrows+1)+(aij->nz))*sizeof(int)+ncprow*sizeof(PetscInt)+(3+(aij->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

      /* Publish the new mult struct */
      cusp->mat = mult;
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
    cusp->nonzerostate = A->nonzerostate;
  }

  if (hasvalues) A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
20249ae82921SPaul Mullowney 
/* Functor for tuple-valued iterators: adds element 0 of the tuple into element 1. */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry) + thrust::get<1>(entry);
  }
};
2034aa372e3fSPaul Mullowney 
/* Functor for tuple-valued iterators: assigns element 0 of the tuple to element 1. */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
20447e8381f9SStefano Zampini 
/* Functor for tuple-valued iterators: assigns element 1 of the tuple to element 0. */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
2054e6e9a74fSStefano Zampini 
/*
  Data attached to a matrix-matrix product; released by MatDestroy_MatMatCusparse().
  Member order is significant only in that it is left exactly as it was.
*/
struct MatMatCusparse {
  PetscBool             cisdense;
  PetscScalar           *Bt;       /* released with cudaFree() */
  Mat                   X;         /* released with MatDestroy() */
  PetscBool             reusesym;  /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;     /* released with delete */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* Members below exist only when building against CUDA 11.0 or newer */
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;          /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr,matCDescr;
  PetscInt              Blda,Clda;            /* Record leading dimensions of B and C here to detect changes */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  /* Two extra buffers, present only with CUDA 11.4 or newer; released with cudaFree() */
  void                  *dBuffer4,*dBuffer5;
 #endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2;           /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2079ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - releases everything owned by a MatMatCusparse struct and
  then the struct itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy (passed as void*)

  The transposed-B buffer and the B CSR copy are released unconditionally; cuSPARSE
  descriptors and the SpGEMM buffers are released only when non-NULL and only when
  compiled against a CUDA version that declares them.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cuSPARSE descriptors */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
  /* Device buffers */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4) {
    cerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cerr);
  }
  if (mm->dBuffer5) {
    cerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cerr);
  }
 #endif
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2108ccdfe979SStefano Zampini 
/* dense-dense multiply used below to finish PtAP and RARt products; defined outside this file section */
2109ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2110ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of a product between a SEQAIJCUSPARSE matrix A and a dense matrix B.

  Input Parameter:
. C - the product matrix; C->product must carry A, B, the product type and the MatMatCusparse data created by the symbolic phase

  Handles AB, AtB, ABt, PtAP and RARt. The sparse-dense part is done with cusparseSpMM() (CUDA >= 11) or
  cusparse_csr_spmm() (older CUDA). For PtAP and RARt the sparse-dense result goes into mmdata->X and C is then
  obtained with a dense-dense multiply. Errors out if the product data is missing, A has the wrong type,
  A is bound to the CPU, or the product type is not supported.
*/
2111ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2112ccdfe979SStefano Zampini {
2113ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2114ccdfe979SStefano Zampini   Mat                          A,B;
2115afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2116ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2117ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2118ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2119ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2120ccdfe979SStefano Zampini   const PetscScalar            *barray;
2121ccdfe979SStefano Zampini   PetscScalar                  *carray;
2122ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2123ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2124ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2125ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2126ccdfe979SStefano Zampini 
2127ccdfe979SStefano Zampini   PetscFunctionBegin;
2128ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2129e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2130ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2131ccdfe979SStefano Zampini   A    = product->A;
2132ccdfe979SStefano Zampini   B    = product->B;
2133ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2134e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2135ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2136ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
2137ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2138ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2139ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
/* select the sparse operand (A or its explicit transpose), the operation applied to it, and the m x n size of the sparse-dense result */
2140ccdfe979SStefano Zampini   switch (product->type) {
2141ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2142ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2143ccdfe979SStefano Zampini     mat = cusp->mat;
2144ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2145ccdfe979SStefano Zampini     m   = A->rmap->n;
2146ccdfe979SStefano Zampini     n   = B->cmap->n;
2147ccdfe979SStefano Zampini     break;
2148ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
/* either let cuSPARSE apply the transpose, or multiply with the explicitly formed transpose when A requests one */
21491a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2150e6e9a74fSStefano Zampini       mat = cusp->mat;
2151e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2152e6e9a74fSStefano Zampini     } else {
21533606e59fSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2154ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2155ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2156e6e9a74fSStefano Zampini     }
2157ccdfe979SStefano Zampini     m = A->cmap->n;
2158ccdfe979SStefano Zampini     n = B->cmap->n;
2159ccdfe979SStefano Zampini     break;
2160ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2161ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2162ccdfe979SStefano Zampini     mat = cusp->mat;
2163ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2164ccdfe979SStefano Zampini     m   = A->rmap->n;
2165ccdfe979SStefano Zampini     n   = B->rmap->n;
2166ccdfe979SStefano Zampini     break;
2167ccdfe979SStefano Zampini   default:
2168e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2169ccdfe979SStefano Zampini   }
2170e8d2b73aSMark Adams   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2171ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2172ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2173ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
/* the conversion is done in place and undone at the end of this routine */
2174afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2175ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2176afb2bd1cSJunchao Zhang 
2177ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
/* the sparse-dense result is written to the intermediate X for the triple products, directly to C otherwise */
2178c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2179c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2180c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2181c8378d12SStefano Zampini   } else {
2182c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2183c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2184c8378d12SStefano Zampini   }
2185c8378d12SStefano Zampini 
2186c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2187afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/* ABt and RARt need B transposed; cusparseSpMM applies it through opB */
2188afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2189a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
/* NOTE(review): when only one of the two LDAs changed, the dense descriptor that is kept (and mat->matDescr) does not get
   its value pointer refreshed in this branch, since the SetValues calls are only in the else branch - confirm the arrays cannot have moved */
2190afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2191fcdce8c4SStefano Zampini     size_t mmBufferSize;
2192afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2193afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2194afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2195afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2196afb2bd1cSJunchao Zhang     }
2197c8378d12SStefano Zampini 
2198afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2199afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2200afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2201afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2202afb2bd1cSJunchao Zhang     }
2203afb2bd1cSJunchao Zhang 
/* the sparse descriptor lives in the mult struct and is created here only if it does not exist yet */
2204afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2205afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2206afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2207afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2208afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2209afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2210afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2211afb2bd1cSJunchao Zhang     }
2212afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2213afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2214afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2215fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
/* the work buffer is (re)allocated when it does not exist or is smaller than requested; it is never shrunk */
2216fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2217ee7b52eaSHong Zhang       cudaError_t cerr;
2218fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2219fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2220fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2221fcdce8c4SStefano Zampini     }
2222afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2223afb2bd1cSJunchao Zhang   } else {
2224afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2225afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2226afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2227afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2228afb2bd1cSJunchao Zhang   }
2229afb2bd1cSJunchao Zhang 
2230afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2231afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2232afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2233afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2234fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2235afb2bd1cSJunchao Zhang  #else
2236afb2bd1cSJunchao Zhang   PetscInt k;
2237afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2238ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2239ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2240ccdfe979SStefano Zampini     cublasStatus_t cerr;
2241ccdfe979SStefano Zampini 
/* write B^T into mmdata->Bt (leading dimension B->cmap->n) with cublasXgeam; blda and k then describe the transposed operand */
2242ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2243ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2244ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2245ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2246ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2247ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2248ccdfe979SStefano Zampini     blda = B->cmap->n;
2249afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2250afb2bd1cSJunchao Zhang   } else {
2251afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2252ccdfe979SStefano Zampini   }
2253ccdfe979SStefano Zampini 
2254afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2255ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2256afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2257ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2258ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2259ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2260ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2261ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2262afb2bd1cSJunchao Zhang  #endif
2263c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
/* logged cost: two flops per stored entry of the sparse operand for each of the n result columns */
2264c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2265ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
/* finish the triple products with a dense-dense multiply of B and X into C;
   NOTE(review): the two PetscBool arguments presumably select transposition of the first and second operand - confirm against the callee */
2266ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2267ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2268ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2269ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2270ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2271ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2272ccdfe979SStefano Zampini   } else {
2273ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2274ccdfe979SStefano Zampini   }
/* hand C and B back in the CPU dense type they had before the GPU computation */
2275ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2276ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2277ccdfe979SStefano Zampini   }
2278ccdfe979SStefano Zampini   if (!biscuda) {
2279ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2280ccdfe979SStefano Zampini   }
2281ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2282ccdfe979SStefano Zampini }
2283ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of a product between a SEQAIJCUSPARSE matrix A and a dense matrix B.

  Input Parameter:
. C - the product matrix; C->product must carry A, B and the product type, and must not have product data yet

  Sets the sizes of C, switches its type to MATSEQDENSECUDA, allocates the MatMatCusparse product data
  (including the intermediate matrix X for PtAP/RARt and, for CUDA < 11, the B^T buffer for ABt/RARt),
  and installs MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA() as the numeric routine.
  Errors out if A is not SEQAIJCUSPARSE, is not stored in MAT_CUSPARSE_CSR format, or the product type is unsupported.
*/
2284ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2285ccdfe979SStefano Zampini {
2286ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2287ccdfe979SStefano Zampini   Mat                A,B;
2288ccdfe979SStefano Zampini   PetscInt           m,n;
2289ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2290ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2291ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2292ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2293ccdfe979SStefano Zampini 
2294ccdfe979SStefano Zampini   PetscFunctionBegin;
2295ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2296e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2297ccdfe979SStefano Zampini   A    = product->A;
2298ccdfe979SStefano Zampini   B    = product->B;
2299ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2300e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2301ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2302e8d2b73aSMark Adams   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
/* m x n is the size of the final product C for each supported product type */
2303ccdfe979SStefano Zampini   switch (product->type) {
2304ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2305ccdfe979SStefano Zampini     m = A->rmap->n;
2306ccdfe979SStefano Zampini     n = B->cmap->n;
2307ccdfe979SStefano Zampini     break;
2308ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2309ccdfe979SStefano Zampini     m = A->cmap->n;
2310ccdfe979SStefano Zampini     n = B->cmap->n;
2311ccdfe979SStefano Zampini     break;
2312ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2313ccdfe979SStefano Zampini     m = A->rmap->n;
2314ccdfe979SStefano Zampini     n = B->rmap->n;
2315ccdfe979SStefano Zampini     break;
2316ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2317ccdfe979SStefano Zampini     m = B->cmap->n;
2318ccdfe979SStefano Zampini     n = B->cmap->n;
2319ccdfe979SStefano Zampini     break;
2320ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2321ccdfe979SStefano Zampini     m = B->rmap->n;
2322ccdfe979SStefano Zampini     n = B->rmap->n;
2323ccdfe979SStefano Zampini     break;
2324ccdfe979SStefano Zampini   default:
2325e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2326ccdfe979SStefano Zampini   }
2327ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2328ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2329ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2330ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2331ccdfe979SStefano Zampini 
2332ccdfe979SStefano Zampini   /* product data */
/* NOTE(review): mmdata is attached to C->product only at the end of this routine, so an error return from the
   calls in between would leave it (and anything allocated into it) unreferenced - confirm whether that matters here */
2333ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2334ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2335afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2336afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2337ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2338afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2339ccdfe979SStefano Zampini   }
2340afb2bd1cSJunchao Zhang  #endif
2341ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2342ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2343ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2344ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
/* X has the size of the sparse-dense intermediate: A times B^T for RARt, A times B for PtAP */
2345ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2346ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2347ccdfe979SStefano Zampini     } else {
2348ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2349ccdfe979SStefano Zampini     }
2350ccdfe979SStefano Zampini   }
/* hand ownership of the product data to C together with its destructor */
2351ccdfe979SStefano Zampini   C->product->data    = mmdata;
2352ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2353ccdfe979SStefano Zampini 
2354ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2355ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2356ccdfe979SStefano Zampini }
2357ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of a sparse-sparse product of two SEQAIJCUSPARSE matrices.

  Input Parameter:
. C - the product matrix (SEQAIJCUSPARSE); C->product must carry A, B, the product type and the MatMatCusparse data

  Handles AB, AtB and ABt; the transposed cases multiply with the explicit transpose stored in the
  operand's matTranspose because opA/opB are always NON_TRANSPOSE. The values of C are recomputed on the GPU
  with cusparseSpGEMMreuse_compute() (CUDA >= 11.4), cusparseSpGEMM_compute()/cusparseSpGEMM_copy() (CUDA 11.0-11.3)
  or cusparse_csr_spgemm() (older CUDA). The computation is skipped when the symbolic phase already produced
  the values (mmdata->reusesym) or when C has no nonzeros; in every case C is marked assembled at the end.
*/
2358fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2359ccdfe979SStefano Zampini {
2360ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2361fcdce8c4SStefano Zampini   Mat                          A,B;
2362fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2363fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2364fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2365fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2366fcdce8c4SStefano Zampini   PetscBool                    flg;
2367ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2368fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2369fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2370fcdce8c4SStefano Zampini   MatProductType               ptype;
2371fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2372fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2373fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2374fcdce8c4SStefano Zampini #endif
2375b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2376ccdfe979SStefano Zampini 
2377ccdfe979SStefano Zampini   PetscFunctionBegin;
2378ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2379e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2380fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2381e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2382fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2383fcdce8c4SStefano Zampini   A = product->A;
2384fcdce8c4SStefano Zampini   B = product->B;
/* first numeric call after a symbolic phase that already computed the values: only validate C, clear the flag and skip the computation */
2385fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2386fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2387fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2388e8d2b73aSMark Adams     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2389fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
2390e8d2b73aSMark Adams     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2391fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2392e8d2b73aSMark Adams     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2393fcdce8c4SStefano Zampini     goto finalize;
2394fcdce8c4SStefano Zampini   }
/* nothing to compute for a product without nonzeros */
2395fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2396fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2397e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2398fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2399e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2400fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2401fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2402fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2403fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2404fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2405e8d2b73aSMark Adams   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2406e8d2b73aSMark Adams   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2407e8d2b73aSMark Adams   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2408fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2409fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2410fcdce8c4SStefano Zampini 
/* a transposed product with a symmetric operand is treated as plain AB, provided the symbolic phase made the same choice */
2411fcdce8c4SStefano Zampini   ptype = product->type;
2412fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2413fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2414fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2415fa046f9fSJunchao Zhang   }
2416fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2417fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2418fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2419fa046f9fSJunchao Zhang   }
/* NOTE(review): matTranspose is used as found; this routine does not itself form or refresh the explicit transpose -
   presumably the symbolic phase or MatSeqAIJCUSPARSECopyToGPU() keeps it current, TODO confirm */
2420fcdce8c4SStefano Zampini   switch (ptype) {
2421fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2422fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2423fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2424fcdce8c4SStefano Zampini     break;
2425fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2426fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2427fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2428fcdce8c4SStefano Zampini     break;
2429fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2430fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2431fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2432fcdce8c4SStefano Zampini     break;
2433fcdce8c4SStefano Zampini   default:
2434e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2435fcdce8c4SStefano Zampini   }
2436fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
2437e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2438e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2439e8d2b73aSMark Adams   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2440fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2441fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2442fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2443e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2444e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2445e8d2b73aSMark Adams   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2446fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2447fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2448fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
/* NOTE(review): device pointer mode is selected before alpha_one/beta_zero are passed, so these are presumably device-resident scalars - confirm */
2449b4285af6SJunchao Zhang   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2450b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2451b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2452b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2453b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2454b4285af6SJunchao Zhang                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2455b4285af6SJunchao Zhang   #else
2456b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2457fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2458fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2459fcdce8c4SStefano Zampini                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2460b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2461fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2462fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2463b4285af6SJunchao Zhang   #endif
2464fcdce8c4SStefano Zampini #else
2465b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2466fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2467fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2468fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2469fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2470fcdce8c4SStefano Zampini #endif
/* log the flop count stored in the product data, wait for the GPU, and record that the current values of C live on the GPU */
2471fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2472fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2473fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2474fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2475fcdce8c4SStefano Zampini finalize:
2476fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2477*c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2478fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2479*c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
/* reset the assembly bookkeeping and mark C as assembled */
2480fcdce8c4SStefano Zampini   c->reallocs         = 0;
2481fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2482fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2483fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2484fcdce8c4SStefano Zampini   C->num_ass++;
2485ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2486ccdfe979SStefano Zampini }
2487fcdce8c4SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Symbolic phase of the sparse-sparse product
  stored in C->product, for A and B both of type MATSEQAIJCUSPARSE in MAT_CUSPARSE_CSR format.

  Input/Output Parameter:
. C - the product matrix; C->product holds A, B and the product type. C->product->data must be empty on entry.

  Supported product types are MATPRODUCT_AB, MATPRODUCT_AtB and MATPRODUCT_ABt. The transposed
  operand is formed explicitly with MatSeqAIJCUSPARSEFormExplicitTranspose(), so cuSPARSE is always
  called with non-transposed operations. If A (resp. B) is flagged symmetric, AtB (resp. ABt) is
  treated as AB and this is recorded in the product.

  On success the routine
   - attaches a MatMatCusparse context to C->product->data (destroyed by MatDestroy_MatMatCusparse),
   - sets the sizes and type of C and builds its device CSR structure (row offsets, column indices, values),
   - copies the sparsity pattern back to the host arrays c->i and c->j and fills c->ilen, c->imax, c->rmax, ...
   - installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE as the numeric routine.

  Notes:
  All code paths below (SpGEMMreuse, SpGEMM, legacy csrgemm) also compute the values of C on the device.
  When product->api_user is set and both A and B have offload mask PETSC_OFFLOAD_BOTH, mmdata->reusesym
  is set so that the numeric phase can skip the recomputation.

  Errors are raised with PETSC_ERR_GPU for wrong matrix types/formats, unsupported product types and
  missing device structures; CUDA and cuSPARSE failures are propagated by CHKERRCUDA/CHKERRCUSPARSE.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  /* both operands must be MATSEQAIJCUSPARSE */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* exploit symmetry: A^T*B with symmetric A, or A*B^T with symmetric B, is handled as A*B */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* select the sizes (C is m x n, inner dimension k) and the device structures used as left/right operand */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants, used with CUSPARSE_POINTER_MODE_DEVICE */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* trivially empty product: build an empty CSR structure and skip every cuSPARSE call */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* build a CsrMatrix view of B that shares column indices and values with the compressed
       structure but uses the full (uncompressed) row offsets; it is kept in mmdata->Bcsr */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count */
  if (ptype == MATPRODUCT_AB) {
    /* each nonzero a(i,brow) contributes one multiply-add per nonzero in row brow of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A is combined with row i of B */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C's descriptor is created with no entries; the index/value pointers are attached once nnz(C) is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* first call queries the sizes of buffers 2, 3 and 4; second call does the nnz computation */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  /* buffers 1 and 2 are released here; buffer 3 is released after the copy step below */
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* first call queries the size of buffer 5; second call performs the copy step */
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  /* compute the values of C as well */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* the buffer sizes are size_t: cast to long so that the variadic arguments match the %ld conversions
     (size_t and long differ in width on LLP64 platforms) */
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,(long)(bufferSize4/1024),(long)(bufferSize5/1024));CHKERRQ(ierr);
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* the buffer sizes are size_t: cast to long so that the variadic arguments match the %ld conversions
     (size_t and long differ in width on LLP64 platforms) */
  ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,(long)(bufSize2/1024),(long)(mmdata->mmBufferSize/1024));CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* the nnz count is returned through the host variable cnz, hence host pointer mode for this call */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the device sparsity pattern of C in the host Mat_SeqAIJ structure */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    /* in compressed row format the device offsets are the compressed ones: they are expanded into c->i below */
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    /* in compressed row format the device offsets are the compressed ones: they are expanded into c->i below */
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      /* every row up to the k-th stored row ends at the offset where that stored row starts */
      for (; r < next; r++) c->i[r+1] = old;
    }
    /* remaining rows end at the total number of entries */
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row lengths, number of nonempty rows and maximum row length */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2886fcdce8c4SStefano Zampini 
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
   Chooses the symbolic-phase routine (mat->ops->productsymbolic) for the matrix product
   attached to mat. Handles both sparse and dense B.

   Steps:
     1. Detect whether B is (derived from) MATSEQDENSE, and whether B (and C, for ABC
        products) is MATSEQAIJCUSPARSE; the latter is only tested for operands that are
        not bound to the CPU.
     2. When the sparse GPU path is eligible, let the user force the CPU implementation
        through a "..._backend_cpu" option whose name depends on the product type and on
        whether the product was requested through the user-level API (product->api_user).
     3. Dispatch: dense B, GPU sparse-sparse, or the plain SeqAIJ fallback.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  PetscErrorCode ierr;
  Mat_Product    *prod = mat->product;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)prod->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  if (!prod->A->boundtocpu && !prod->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)prod->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  if (prod->type == MATPRODUCT_ABC) { /* the third operand must qualify for the GPU path as well */
    Ccusp = PETSC_FALSE;
    if (!prod->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)prod->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }

  if (Bcusp && Ccusp) { /* we can always select the CPU backend */
    PetscBool  forcecpu = PETSC_FALSE;
    const char *title = NULL,*optname = NULL,*manpage = NULL;

    /* pick the options-block title, the option name and the manual page for this product type */
    switch (prod->type) {
    case MATPRODUCT_AB:
      manpage = "MatMatMult";
      if (prod->api_user) {title = "MatMatMult";          optname = "-matmatmult_backend_cpu";}
      else                {title = "MatProduct_AB";       optname = "-matproduct_ab_backend_cpu";}
      break;
    case MATPRODUCT_AtB:
      manpage = "MatTransposeMatMult";
      if (prod->api_user) {title = "MatTransposeMatMult"; optname = "-mattransposematmult_backend_cpu";}
      else                {title = "MatProduct_AtB";      optname = "-matproduct_atb_backend_cpu";}
      break;
    case MATPRODUCT_PtAP:
      manpage = "MatPtAP";
      if (prod->api_user) {title = "MatPtAP";             optname = "-matptap_backend_cpu";}
      else                {title = "MatProduct_PtAP";     optname = "-matproduct_ptap_backend_cpu";}
      break;
    case MATPRODUCT_RARt:
      manpage = "MatRARt";
      if (prod->api_user) {title = "MatRARt";             optname = "-matrart_backend_cpu";}
      else                {title = "MatProduct_RARt";     optname = "-matproduct_rart_backend_cpu";}
      break;
    case MATPRODUCT_ABC:
      manpage = "MatMatMatMult";
      if (prod->api_user) {title = "MatMatMatMult";       optname = "-matmatmatmult_backend_cpu";}
      else                {title = "MatProduct_ABC";      optname = "-matproduct_abc_backend_cpu";}
      break;
    default: /* no CPU-backend option is queried for the remaining product types */
      break;
    }
    if (optname) {
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");CHKERRQ(ierr);
      ierr = PetscOptionsBool(optname,"Use CPU code",manpage,forcecpu,&forcecpu,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    if (forcecpu) Bcusp = Ccusp = PETSC_FALSE;
  }

  /* dispatch */
  if (!Bdense && !(Bcusp && Ccusp)) { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Bdense) {
    switch (prod->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!prod->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else { /* A lives on the CPU: defer to the SeqAIJ * SeqDense setup */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
  } else { /* B (and C, if any) are SEQAIJCUSPARSE and the CPU backend was not requested */
    switch (prod->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  }
  PetscFunctionReturn(0);
}
3011ccdfe979SStefano Zampini 
/* yy = A*xx: forwards to the shared kernel with no additive vector and op(A) = A */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3020e6e9a74fSStefano Zampini 
/* zz = A*xx + yy: forwards to the shared kernel with op(A) = A */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3029e6e9a74fSStefano Zampini 
/* yy = A^H*xx: forwards to the shared kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3038e6e9a74fSStefano Zampini 
/* zz = A^H*xx + yy: forwards to the shared kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30479ae82921SPaul Mullowney 
/* yy = A^T*xx: forwards to the shared kernel with the transpose flag set and no conjugation */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3056ca45077fSPaul Mullowney 
/*
   ScatterAdd - device kernel computing y[idx[i]] += x[i] for 0 <= i < n.

   One thread handles one entry; threads whose global index is >= n do nothing, so any
   1D launch with at least n threads in total is valid.

   The global index is formed in PetscInt rather than int: n is a PetscInt, which may be
   64 bits wide, and an int index would overflow once blockIdx.x*blockDim.x exceeds INT_MAX.

   NOTE(review): the update is a plain (non-atomic) +=, so this presumably relies on the
   entries of idx being pairwise distinct - confirm against the callers.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x; /* widen before multiplying */
  if (i < n) y[idx[i]] += x[i];
}
3062a0e72f99SJunchao Zhang 
/*
   MatMultAddKernel_SeqAIJCUSPARSE - common implementation of the MatMult family: z = op(A) x + y.

   If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op.

   Input Parameters:
+  A     - the matrix
.  xx    - the vector x
.  yy    - the vector y to add, or NULL to compute z = op(A) x
.  trans - use the transpose of A
-  herm  - together with trans, use the conjugate transpose of A; herm without trans is an error

   Output Parameter:
.  zz - the result z; it may be the same vector as yy

   Notes:
   If A has no nonzero rows the routine only sets zz to zero (yy == NULL) or copies yy into zz.

   When trans is set, herm is not, and A->form_explicit_transpose is true, the explicitly stored
   transpose is multiplied with CUSPARSE_OPERATION_NON_TRANSPOSE; otherwise the original matrix
   is used with the matching cuSPARSE operation.

   When the selected matrix uses compressed rows (cprowIndices is set), cusparsestruct->workVector
   holds the short vector: the product in the non-transposed case (scattered into zz afterwards),
   the gathered entries of xx in the transposed case.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx = 0,ny = 0; /* only set (and only used) for the CSR format */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* op(A) x is identically zero */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length and is written straight into zarray.
         zarray holds y only when yy == zz (it was obtained read-write above); in that case beta = 1 accumulates in place.
         Otherwise zarray was obtained write-only and its contents are undefined, so beta must be 0; y, if any,
         is added by the VecAXPY after the product.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt    n = matstruct->cprowIndices->size();
        cudaError_t cerr;
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
        /* a kernel launch returns no status; pick up launch-configuration errors explicitly (does not synchronize) */
        cerr = cudaGetLastError();CHKERRCUDA(cerr);
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) { /* the product was computed with beta = 0, so yy still has to be added */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop count: 2*nz when yy is added, 2*nz - (number of nonzero rows) for the plain product */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
32549ae82921SPaul Mullowney 
/* zz = A^T*xx + yy: forwards to the shared kernel with the transpose flag set and no conjugation */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3263ca45077fSPaul Mullowney 
/*
   Finishes assembly through MatAssemblyEnd_SeqAIJ() and, if that changed the nonzero state
   of A while a deviceMat exists, frees the deviceMat with cudaFree() and resets the pointer.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState statebefore = A->nonzerostate; /* sampled before the host assembly runs */
  PetscErrorCode         ierr;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  if (statebefore == A->nonzerostate || !cusparsestruct->deviceMat) { /* same nonzero state, or nothing to free */
    PetscFunctionReturn(0);
  }
  ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
  cerr = cudaFree(cusparsestruct->deviceMat);CHKERRCUDA(cerr);
  cusparsestruct->deviceMat = NULL;
  PetscFunctionReturn(0);
}
32819ae82921SPaul Mullowney 
32829ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3283e057df02SPaul Mullowney /*@
32849ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3285e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately be pushed down
3286e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3287e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3288e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3289e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
32909ae82921SPaul Mullowney 
3291d083f849SBarry Smith    Collective
32929ae82921SPaul Mullowney 
32939ae82921SPaul Mullowney    Input Parameters:
32949ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
32959ae82921SPaul Mullowney .  m - number of rows
32969ae82921SPaul Mullowney .  n - number of columns
32979ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
32989ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
32990298fd71SBarry Smith          (possibly different for each row) or NULL
33009ae82921SPaul Mullowney 
33019ae82921SPaul Mullowney    Output Parameter:
33029ae82921SPaul Mullowney .  A - the matrix
33039ae82921SPaul Mullowney 
33049ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
33069ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
33079ae82921SPaul Mullowney 
33089ae82921SPaul Mullowney    Notes:
33099ae82921SPaul Mullowney    If nnz is given then nz is ignored
33109ae82921SPaul Mullowney 
33119ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
33129ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
33139ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
33149ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
33159ae82921SPaul Mullowney 
33169ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
33170298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
33189ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
33199ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
33209ae82921SPaul Mullowney 
33219ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
33229ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
33239ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
33249ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
33259ae82921SPaul Mullowney 
33269ae82921SPaul Mullowney    Level: intermediate
33279ae82921SPaul Mullowney 
3328e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
33299ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            B;

  PetscFunctionBegin;
  /* create the matrix directly in the caller's slot, then work through a local alias */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  B    = *A;
  /* the same m and n are passed for both the local and the global sizes */
  ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the preallocation routine takes a non-const array, hence the cast */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33419ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - Destroys the CUSPARSE-specific data hanging off A->spptr (the mult
   structure for unfactored matrices, the triangular-factor structure otherwise), removes the
   functions composed on the object, and finally hands the matrix to MatDestroy_SeqAIJ().

   NOTE(review): nothing visible here releases Mat_SeqAIJCUSPARSE::deviceMat; confirm it is freed
   elsewhere before destruction.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed function names to remove, kept in the order they are cleared */
  static const char *const composedNames[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  const size_t   ncomposed = sizeof(composedNames)/sizeof(composedNames[0]);
  size_t         k;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  for (k=0; k<ncomposed; k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composedNames[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33659ae82921SPaul Mullowney 
3366ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
336795639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
   MatDuplicate_SeqAIJCUSPARSE - Duplicates A with MatDuplicate_SeqAIJ() and then converts the
   duplicate, in place, to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: SeqAIJ duplicate */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* step 2: in-place conversion; *B is both the input and the output matrix */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33779ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

   Dispatch, in order:
     - X and Y do not share the same axpy implementation: use MatAXPY_SeqAIJ().
     - SUBSET_NONZERO_PATTERN: cusparse_csr_spgeam() with the result written into Y's CSR arrays.
     - SAME_NONZERO_PATTERN (passed in, or detected by comparing the device CSR index arrays):
       cublasXaxpy() on the value arrays.
     - any other structure flag: use MatAXPY_SeqAIJ().

   The device paths error out unless both matrices use MAT_CUSPARSE_CSR storage.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ*)X->data,*yaij = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cuspY = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cuspX = (Mat_SeqAIJCUSPARSE*)X->spptr;
  PetscScalar        *yvals;
  const PetscScalar  *xvals;
  CsrMatrix          *csrY,*csrX;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cuspY->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cuspX->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csrY = (CsrMatrix*)cuspY->mat->mat;
  csrX = (CsrMatrix*)cuspX->mat->mat;

  /* see if we can turn this into a cublas axpy: equal nonzero counts, no compressed rows, and identical
     row offsets and column indices on the device (the column comparison only runs if the rows match) */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    if (thrust::equal(thrust::device,csrY->row_offsets->begin(),csrY->row_offsets->end(),csrX->row_offsets->begin()) &&
        thrust::equal(thrust::device,csrY->column_indices->begin(),csrY->column_indices->end(),csrX->column_indices->begin())) {
      str = SAME_NONZERO_PATTERN;
    }
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  /* neither device kernel applies: invalidate Y's transpose and use the SeqAIJ implementation */
  if (str != SUBSET_NONZERO_PATTERN && str != SAME_NONZERO_PATTERN) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* both device paths read X's values and update Y's values */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
  if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      unit = 1.0; /* coefficient applied to Y in a*X + unit*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
    cudaError_t      cerr;
#endif

    /* the scalars a and unit are passed as host pointers */
    stat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                          &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                          cuspY->mat->descr,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                               &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                               &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                               cuspY->mat->descr,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),buffer);
#else
    stat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                               &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                               &unit,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                               cuspY->mat->descr,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get());
#endif
    CHKERRCUSPARSE(stat);
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    /* switch the handle back to device pointer mode */
    stat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  } else {
    /* SAME_NONZERO_PATTERN: a plain axpy over the value arrays */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   inc = 1, bnz = 1;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(xaij->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,xvals,inc,yvals,inc);CHKERRCUBLAS(berr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
347595639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a with a single cublasXscal() call
   over the array returned by MatSeqAIJCUSPARSEGetArray(), then invalidates Y's diagonal data.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *yaij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  PetscBLASInt   inc = 1, nvals = 1;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr  = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr  = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* the number of scaled entries is the nonzero count, converted to the BLAS integer type */
  ierr  = PetscBLASIntCast(yaij->nz,&nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nvals,&a,vals,inc);CHKERRCUBLAS(cberr);
  ierr  = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
349733c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeros the matrix values on the host and, for unfactored matrices,
   also in whatever device value arrays exist (the matrix and its transpose).

   The offload mask becomes PETSC_OFFLOAD_BOTH when the device values of the matrix itself were
   zeroed, and PETSC_OFFLOAD_CPU otherwise.

   NOTE(review): the device structures are cast to CsrMatrix without looking at the storage format;
   confirm that is valid for every format that can reach this routine.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij         = (Mat_SeqAIJ*)A->data;
  PetscBool      deviceZeroed = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        deviceZeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      /* zeroing the transpose does not affect the offload mask */
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host values directly (a->i[number of rows] entries) rather than calling MatZeroEntries_SeqAIJ() */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = deviceZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
35283fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Switches A between the CUSPARSE implementations (flg false) and the
   SeqAIJ implementations (flg true) of its operations.

   Factored matrices only record the flag. For unfactored matrices:
     - flg true:  MatSeqAIJCUSPARSECopyFromGPU() is called first, the operation table is pointed at the
                  SeqAIJ routines, the Mat_SeqAIJ array-access ops are zeroed, and the CUSPARSE-specific
                  composed functions are removed.
     - flg false: the operation table, the array-access ops and the composed functions are set to the
                  CUSPARSE versions.
   Inodes are enabled only when flg is true and a->inode.size is nonzero.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    A->boundtocpu = flg;
    PetscFunctionReturn(0);
  }
  if (!flg) {
    /* Mat operation table -> CUSPARSE */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* Mat_SeqAIJ array-access ops -> CUSPARSE */
    aij->ops->getarray                = MatSeqAIJGetArray_SeqAIJCUSPARSE;
    aij->ops->restorearray            = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
    aij->ops->getarrayread            = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
    aij->ops->restorearrayread        = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
    aij->ops->getarraywrite           = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
    aij->ops->restorearraywrite       = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
    /* composed functions -> CUSPARSE */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* copy from the GPU before handing the operations to the SeqAIJ routines */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* Mat operation table -> SeqAIJ (the Hermitian-transpose products are left unset) */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* drop every Mat_SeqAIJ array-access override */
    ierr = PetscMemzero(aij->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
    /* remove the CUSPARSE-specific composed functions; MatSeqAIJGetArray_C is set to the SeqAIJ routine */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }
  A->boundtocpu  = flg;
  /* inodes are used only when bound to the CPU and inode data is present */
  aij->inode.use = (flg && aij->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3592a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE matrix.

   Depending on reuse, the target is a fresh duplicate of A (MAT_INITIAL_MATRIX), the existing *newmat
   after A has been copied into it (MAT_REUSE_MATRIX), or *newmat as passed in (any other value, e.g.
   MAT_INPLACE_MATRIX). The target then gets VECCUDA as its default vector type, a CUSPARSE data
   structure in spptr if it has none (not for MAT_REUSE_MATRIX), the CUSPARSE operation table, and the
   CUSPARSE composed functions.

   The mtype argument is not read.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* *newmat is used as it is */
    break;
  }
  B = *newmat;

  /* vectors created from B default to VECCUDA */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: triangular-factor container with its own cusparse handle */
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr = PetscNew(&factors);CHKERRQ(ierr);
      stat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(factors->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = factors;
    } else {
      /* unfactored matrix: mult container with its own cusparse handle and default algorithms */
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      cusp->format     = MAT_CUSPARSE_CSR;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
      cusp->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
  #else
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
  #endif
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
#endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* binding with PETSC_FALSE installs the remaining CUSPARSE operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
36549ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor: sets B up as a SeqAIJ matrix and then converts it,
   in place, to MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* SeqAIJ setup first */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* in-place conversion: B is both the source and, through &B, the destination */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
366402fe1965SBarry Smith 
36653ca39a21SBarry Smith /*MC
3666e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3667e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
36692692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
36702692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3671e057df02SPaul Mullowney 
3672e057df02SPaul Mullowney    Options Database Keys:
3673e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3674aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3677e057df02SPaul Mullowney 
3678e057df02SPaul Mullowney   Level: beginner
3679e057df02SPaul Mullowney 
36808468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3681e057df02SPaul Mullowney M*/
36827f756511SDominic Meiser 
3683bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
36840f39cd5aSBarry Smith 
/*
   MatSolverTypeRegister_CUSPARSE - Registers the factorization routines provided by this package:
   the band LU solver (MATSOLVERCUSPARSEBAND, registered for MATSEQAIJ) followed by the
   MATSOLVERCUSPARSE LU, Cholesky, ILU and ICC factorizations for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factor types served by MatGetFactor_seqaijcusparse_cusparse(), in registration order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const size_t        nftypes  = sizeof(ftypes)/sizeof(ftypes[0]);
  size_t              k;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
369829b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Releases everything owned by a Mat_SeqAIJCUSPARSE: the mult structures
   of the matrix and of its transpose, the auxiliary arrays, the cusparse handle, and finally the
   structure itself. Does nothing when *cusparsestruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  /* auxiliary arrays; delete on a NULL pointer is a no-op */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;
  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  /* free through the caller's pointer rather than the local alias */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37187f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - deletes a CsrMatrix together with its values, column_indices and
  row_offsets arrays, and sets *mat to NULL. A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
37317f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - releases one
  Mat_SeqAIJCUSPARSETriFactorStruct.

  Destroys the matrix descriptor and the solve analysis info, the CSR storage, the solve buffer
  (cudaFree), the AA_h buffer (cudaFreeHost) and, for CUDA >= 11, the csr2csc buffer; then frees
  the struct itself with PetscFree(). A NULL *trifactor is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37517f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - releases one
  Mat_SeqAIJCUSPARSEMultStruct and sets *matstruct to NULL.

  Input Parameters:
+ matstruct - address of the struct to destroy; a NULL *matstruct is a no-op
- format    - storage format of (*matstruct)->mat; selects how the matrix payload is released

  Notes:
  For MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB the payload is a cusparseHybMat_t; with CUDA >= 11
  these formats raise PETSC_ERR_SUP. Otherwise the payload is a CsrMatrix.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* scalar constants were allocated with the CUDA allocator, hence cudaFree */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* release the SpMV work buffer and dense-vector descriptors of every initialized slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37957f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the factor data held by a
  Mat_SeqAIJCUSPARSETriFactors while keeping the struct itself alive.

  Destroys the four triangular-factor structs, deletes the permutation and work arrays,
  frees the band buffers a_band_d / i_band_d with cudaFree() and clears init_dev_prop.
  Every released pointer is reset to NULL so the function can safely be called again on
  the same struct (MatSeqAIJCUSPARSETriFactors_Destroy() calls it once more before freeing).
  A NULL *trifactors is a no-op.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* avoid a double free on the next Reset/Destroy */
    }
    if ((*trifactors)->i_band_d) {
      cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* avoid a double free on the next Reset/Destroy */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3818ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets the triangular-factor container, destroys its
  cuSPARSE handle (if any) and frees the container with PetscFree(). A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  {
    cusparseHandle_t handle = (*trifactors)->handle;

    if (handle) {
      cusparseStatus_t stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
38357e8381f9SStefano Zampini 
/* Strict ordering of (i,j) pairs: by first component, ties broken by the second component */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = t1.get<0>();
    const PetscInt i2 = t2.get<0>();

    if (i1 != i2) return i1 < i2;
    return t1.get<1>() < t2.get<1>();
  }
};
38467e8381f9SStefano Zampini 
/* Equality of (i,j) pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
38567e8381f9SStefano Zampini 
/* Change indicator: 1 when the two indices differ, 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
38657e8381f9SStefano Zampini 
/* Logical OR of two flags returned as a PetscInt: 1 when either argument is nonzero, else 0 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
38747e8381f9SStefano Zampini 
38757e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of a MATSEQAIJCUSPARSE matrix from an array
  given in the COO ordering recorded by MatSetPreallocationCOO_SeqAIJCUSPARSE().

  Input Parameters:
+ A     - the matrix
. v     - the values, in host or device memory (checked with isCudaMem()); may be NULL, in which
          case the device values are zeroed for INSERT_VALUES and left untouched otherwise
- imode - ADD_VALUES adds to the existing values, any other mode overwrites them

  Notes:
  If no COO permutation is present the matrix is simply assembled and the function returns.
  Values are written directly on the device, so the offload mask becomes PETSC_OFFLOAD_GPU and
  any cached explicit transpose is flagged as out of date.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *cooPerm_v = NULL;
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR payload itself as well as its values array before dereferencing */
  if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: stage a temporary device copy, released at finalize */
    cooPerm_v = new THRUSTARRAY(n);
    cooPerm_v->assign(v,v+n);
    d_v = cooPerm_v->data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to sum them */
      THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
        cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonzeros in d_v[].
        cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      delete cooPerm_w;
    } else {
      /* all nonzeros in d_v[] are unique entries */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  delete cooPerm_v;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* the device values may have changed: a cached explicit transpose is no longer current */
  A->transupdated = PETSC_FALSE;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
39577e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - flags the cached transpose of a MATSEQAIJCUSPARSE
  matrix as out of date.

  Input Parameters:
+ A       - the matrix (must be of type MATSEQAIJCUSPARSE)
- destroy - when true, also destroys the transpose mult struct and the csr2csc_i index array

  Notes:
  Nothing is done if the matrix has no CUSPARSE struct attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3974a49f1ed0SStefano Zampini 
39757e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR nonzero structure of a MATSEQAIJCUSPARSE
  matrix from n (row,column) pairs and records the permutation used later by
  MatSetValuesCOO_SeqAIJCUSPARSE().

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries (must be non-negative)
. coo_i - row indices, read as a host array of length n
- coo_j - column indices, read as a host array of length n

  Notes:
  The pairs are sorted by row and then by column on the device; cooPerm stores the sorting
  permutation and cooPerm_a (only when duplicates exist) maps each sorted entry to its unique
  nonzero. The host CSR arrays of the matrix are rebuilt, the values are zeroed, the data are
  copied to the GPU, any transpose is destroyed, and the matrix is left unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* same guard (and message) as MatSetValuesCOO_SeqAIJCUSPARSE(): cusp is dereferenced below */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  /* a negative count would be turned into a huge unsigned device-vector size */
  if (n < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Number of COO entries cannot be negative");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* size changed: drop the old permutation data */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = d_i; /* copy the sorted array */
    THRUSTINTARRAY w = d_j;

    /*
      d_i     = [1,1,3,3,4,4]
      d_j     = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */

    /*
      d_i     = [1,3,3,4,4,x]
                            ^ekey
      d_j     = [2,2,3,5,6,x]
                           ^nekey
    */
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
                        ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4093ed502f03SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (may be NULL if not needed)
-   j - the CSR column indices (may be NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      If both i and j are NULL the routine returns immediately without touching the matrix

      May trigger a host to device copy, since the matrix is copied to the GPU before the pointers are returned

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing requested; each output is individually optional (see the guards below) */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* build (once) a device copy of the uncompressed host row offsets */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
41425f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE, the value passed to MatSeqAIJCUSPARSEGetIJ() (not used by this routine)

    Output Parameters:
+   i - the CSR row pointers; set to NULL on return (ignored if NULL)
-   j - the CSR column indices; set to NULL on return (ignored if NULL)

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only invalidate the caller's handles; no device data is released here */
  if (i) {
    *i = NULL;
  }
  if (j) {
    *j = NULL;
  }
  PetscFunctionReturn(0);
}
41695f101d05SStefano Zampini 
41705b7e41feSStefano Zampini /*@C
41715b7e41feSStefano Zampini    MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
41725b7e41feSStefano Zampini 
41735b7e41feSStefano Zampini    Not Collective
41745b7e41feSStefano Zampini 
41755b7e41feSStefano Zampini    Input Parameter:
41765b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
41775b7e41feSStefano Zampini 
41785b7e41feSStefano Zampini    Output Parameter:
41795b7e41feSStefano Zampini .   a - pointer to the device data
41805b7e41feSStefano Zampini 
41815b7e41feSStefano Zampini    Level: developer
41825b7e41feSStefano Zampini 
41835b7e41feSStefano Zampini    Notes: may trigger host-device copies if up-to-date matrix data is on host
41845b7e41feSStefano Zampini 
41855b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
41865b7e41feSStefano Zampini @*/
4187ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
4188ed502f03SStefano Zampini {
4189ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4190ed502f03SStefano Zampini   CsrMatrix          *csr;
4191ed502f03SStefano Zampini   PetscErrorCode     ierr;
4192ed502f03SStefano Zampini 
4193ed502f03SStefano Zampini   PetscFunctionBegin;
4194ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4195ed502f03SStefano Zampini   PetscValidPointer(a,2);
4196ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4197ed502f03SStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4198ed502f03SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
419933c9ba73SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4200ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
4201ed502f03SStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4202ed502f03SStefano Zampini   *a = csr->values->data().get();
4203ed502f03SStefano Zampini   PetscFunctionReturn(0);
4204ed502f03SStefano Zampini }
4205ed502f03SStefano Zampini 
42065b7e41feSStefano Zampini /*@C
42075b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()
42085b7e41feSStefano Zampini 
42095b7e41feSStefano Zampini    Not Collective
42105b7e41feSStefano Zampini 
42115b7e41feSStefano Zampini    Input Parameter:
42125b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
42135b7e41feSStefano Zampini 
42145b7e41feSStefano Zampini    Output Parameter:
42155b7e41feSStefano Zampini .   a - pointer to the device data
42165b7e41feSStefano Zampini 
42175b7e41feSStefano Zampini    Level: developer
42185b7e41feSStefano Zampini 
42195b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead()
42205b7e41feSStefano Zampini @*/
4221ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
4222ed502f03SStefano Zampini {
4223ed502f03SStefano Zampini   PetscFunctionBegin;
4224ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4225ed502f03SStefano Zampini   PetscValidPointer(a,2);
4226ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4227ed502f03SStefano Zampini   *a = NULL;
4228ed502f03SStefano Zampini   PetscFunctionReturn(0);
4229ed502f03SStefano Zampini }
4230ed502f03SStefano Zampini 
42315b7e41feSStefano Zampini /*@C
42325b7e41feSStefano Zampini    MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
42335b7e41feSStefano Zampini 
42345b7e41feSStefano Zampini    Not Collective
42355b7e41feSStefano Zampini 
42365b7e41feSStefano Zampini    Input Parameter:
42375b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
42385b7e41feSStefano Zampini 
42395b7e41feSStefano Zampini    Output Parameter:
42405b7e41feSStefano Zampini .   a - pointer to the device data
42415b7e41feSStefano Zampini 
42425b7e41feSStefano Zampini    Level: developer
42435b7e41feSStefano Zampini 
42445b7e41feSStefano Zampini    Notes: may trigger host-device copies if up-to-date matrix data is on host
42455b7e41feSStefano Zampini 
42465b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
42475b7e41feSStefano Zampini @*/
4248039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
4249039c6fbaSStefano Zampini {
4250039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4251039c6fbaSStefano Zampini   CsrMatrix          *csr;
4252039c6fbaSStefano Zampini   PetscErrorCode     ierr;
4253039c6fbaSStefano Zampini 
4254039c6fbaSStefano Zampini   PetscFunctionBegin;
4255039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4256039c6fbaSStefano Zampini   PetscValidPointer(a,2);
4257039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4258039c6fbaSStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4259039c6fbaSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
426033c9ba73SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4261039c6fbaSStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
4262039c6fbaSStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4263039c6fbaSStefano Zampini   *a = csr->values->data().get();
4264039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
4265a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
4266039c6fbaSStefano Zampini   PetscFunctionReturn(0);
4267039c6fbaSStefano Zampini }
42685b7e41feSStefano Zampini /*@C
42695b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()
4270039c6fbaSStefano Zampini 
42715b7e41feSStefano Zampini    Not Collective
42725b7e41feSStefano Zampini 
42735b7e41feSStefano Zampini    Input Parameter:
42745b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
42755b7e41feSStefano Zampini 
42765b7e41feSStefano Zampini    Output Parameter:
42775b7e41feSStefano Zampini .   a - pointer to the device data
42785b7e41feSStefano Zampini 
42795b7e41feSStefano Zampini    Level: developer
42805b7e41feSStefano Zampini 
42815b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray()
42825b7e41feSStefano Zampini @*/
4283039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
4284039c6fbaSStefano Zampini {
4285039c6fbaSStefano Zampini   PetscErrorCode ierr;
4286039c6fbaSStefano Zampini 
4287039c6fbaSStefano Zampini   PetscFunctionBegin;
4288039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4289039c6fbaSStefano Zampini   PetscValidPointer(a,2);
4290039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4291039c6fbaSStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
4292039c6fbaSStefano Zampini   *a = NULL;
4293039c6fbaSStefano Zampini   PetscFunctionReturn(0);
4294039c6fbaSStefano Zampini }
4295039c6fbaSStefano Zampini 
42965b7e41feSStefano Zampini /*@C
42975b7e41feSStefano Zampini    MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored
42985b7e41feSStefano Zampini 
42995b7e41feSStefano Zampini    Not Collective
43005b7e41feSStefano Zampini 
43015b7e41feSStefano Zampini    Input Parameter:
43025b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
43035b7e41feSStefano Zampini 
43045b7e41feSStefano Zampini    Output Parameter:
43055b7e41feSStefano Zampini .   a - pointer to the device data
43065b7e41feSStefano Zampini 
43075b7e41feSStefano Zampini    Level: developer
43085b7e41feSStefano Zampini 
43095b7e41feSStefano Zampini    Notes: does not trigger host-device copies and flags data validity on the GPU
43105b7e41feSStefano Zampini 
43115b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
43125b7e41feSStefano Zampini @*/
4313ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
4314ed502f03SStefano Zampini {
4315ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
4316ed502f03SStefano Zampini   CsrMatrix          *csr;
4317a49f1ed0SStefano Zampini   PetscErrorCode     ierr;
4318ed502f03SStefano Zampini 
4319ed502f03SStefano Zampini   PetscFunctionBegin;
4320ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4321ed502f03SStefano Zampini   PetscValidPointer(a,2);
4322ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4323ed502f03SStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
432433c9ba73SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4325ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
4326ed502f03SStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
4327ed502f03SStefano Zampini   *a = csr->values->data().get();
4328039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
4329a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
4330ed502f03SStefano Zampini   PetscFunctionReturn(0);
4331ed502f03SStefano Zampini }
4332ed502f03SStefano Zampini 
43335b7e41feSStefano Zampini /*@C
43345b7e41feSStefano Zampini    MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()
43355b7e41feSStefano Zampini 
43365b7e41feSStefano Zampini    Not Collective
43375b7e41feSStefano Zampini 
43385b7e41feSStefano Zampini    Input Parameter:
43395b7e41feSStefano Zampini .   A - a MATSEQAIJCUSPARSE matrix
43405b7e41feSStefano Zampini 
43415b7e41feSStefano Zampini    Output Parameter:
43425b7e41feSStefano Zampini .   a - pointer to the device data
43435b7e41feSStefano Zampini 
43445b7e41feSStefano Zampini    Level: developer
43455b7e41feSStefano Zampini 
43465b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSEGetArrayWrite()
43475b7e41feSStefano Zampini @*/
4348ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
4349ed502f03SStefano Zampini {
4350ed502f03SStefano Zampini   PetscErrorCode ierr;
4351ed502f03SStefano Zampini 
4352ed502f03SStefano Zampini   PetscFunctionBegin;
4353ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4354ed502f03SStefano Zampini   PetscValidPointer(a,2);
4355ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4356ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
4357ed502f03SStefano Zampini   *a = NULL;
4358ed502f03SStefano Zampini   PetscFunctionReturn(0);
4359ed502f03SStefano Zampini }
4360ed502f03SStefano Zampini 
/* Less-than comparator for 4-tuples: orders lexicographically by field 0, then field 1;
   fields 2 and 3 do not take part in the comparison.
   In MatSeqAIJCUSPARSEMergeMats() the tuples are zipped as (COO row, column, value, origin flag).
   The call operator is const so the functor can be invoked through const objects and references. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (t1.get<0>() < t2.get<0>()) return true;                       /* smaller first field wins */
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); /* tie: compare second field */
    return false;
  }
};
4371ed502f03SStefano Zampini 
/* Unary functor adding a fixed integer offset to its argument.
   The call operator only reads _shift, so it is const and can be invoked through const objects and references. */
struct Shift
{
  int _shift; /* offset added to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4383ed502f03SStefano Zampini 
4384ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4385ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4386ed502f03SStefano Zampini {
4387ed502f03SStefano Zampini   PetscErrorCode               ierr;
4388ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4389ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4390ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4391ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4392ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4393ed502f03SStefano Zampini   cusparseStatus_t             stat;
4394ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4395ed502f03SStefano Zampini   cudaError_t                  cerr;
4396ed502f03SStefano Zampini 
4397ed502f03SStefano Zampini   PetscFunctionBegin;
4398ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4399ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4400ed502f03SStefano Zampini   PetscValidPointer(C,4);
4401ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4402ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4403*c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
4404ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4405ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4406ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4407ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4408ed502f03SStefano Zampini     m     = A->rmap->n;
4409ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
4410ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4411ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4412ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4413ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4414ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4415ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4416ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4417ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4418ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4419ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4420ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4421ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4422ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4423ed502f03SStefano Zampini     Ccusp->nrows    = m;
4424ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4425ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4426ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4427ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4428ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4429ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4430ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4431ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4432ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4433ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4434ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4435ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4436ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4437ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4438ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4439ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4440ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4441ed502f03SStefano Zampini 
4442ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4443ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4444ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4445ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4446ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4447ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4448ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4449ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4450ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4451ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4452ed502f03SStefano Zampini     if (c->nz) {
44532ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44542ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44552ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44562ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44572ed87e7eSStefano Zampini 
4458ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4459ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4460ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4461ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4462ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4463ed502f03SStefano Zampini         }
44642ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44652ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4466ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4467ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4468ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4469ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4470ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4471ed502f03SStefano Zampini         }
44722ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44732ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4474ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
44752ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44762ed87e7eSStefano Zampini                               Aroff->data().get(),
44772ed87e7eSStefano Zampini                               Annz,
44782ed87e7eSStefano Zampini                               m,
44792ed87e7eSStefano Zampini                               Acoo->data().get(),
44802ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4481ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44822ed87e7eSStefano Zampini                               Broff->data().get(),
4483ed502f03SStefano Zampini                               Bnnz,
4484ed502f03SStefano Zampini                               m,
44852ed87e7eSStefano Zampini                               Bcoo->data().get(),
4486ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
44872ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44882ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44892ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44908909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4491ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4492ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44938909a122SStefano Zampini #else
44948909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44958909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44968909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44978909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44988909a122SStefano Zampini #endif
44992ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
45002ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
45012ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
45022ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
45032ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
45042ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4505ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4506ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4507ed502f03SStefano Zampini       thrust::advance(p2,Annz);
45082ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
45098909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
45108909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
45118909a122SStefano Zampini #endif
45122ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
45132ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
45142ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
45152ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
45162ed87e7eSStefano Zampini #else
45172ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
45182ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
45192ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
45202ed87e7eSStefano Zampini #endif
4521ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
45222ed87e7eSStefano Zampini                               Ccoo->data().get(),
4523ed502f03SStefano Zampini                               c->nz,
4524ed502f03SStefano Zampini                               m,
4525ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4526ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4527ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
45282ed87e7eSStefano Zampini       delete wPerm;
45292ed87e7eSStefano Zampini       delete Acoo;
45302ed87e7eSStefano Zampini       delete Bcoo;
45312ed87e7eSStefano Zampini       delete Ccoo;
4532ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4533ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4534ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4535ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4536ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4537ed502f03SStefano Zampini #endif
45381a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45393606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
45403606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
4541ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4542ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4543ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4544ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4545ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4546ed502f03SStefano Zampini 
45471a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45481a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4549a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4550ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4551ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4552ed502f03SStefano Zampini         CcsrT->num_rows = n;
4553ed502f03SStefano Zampini         CcsrT->num_cols = m;
4554ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4555ed502f03SStefano Zampini 
4556ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4557ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4558ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4559ed502f03SStefano Zampini 
4560ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4561ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4562ed502f03SStefano Zampini         if (AT) {
4563ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4564ed502f03SStefano Zampini           thrust::advance(rT,-1);
4565ed502f03SStefano Zampini         }
4566ed502f03SStefano Zampini         if (BT) {
4567ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4568ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4569ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4570ed502f03SStefano Zampini         }
4571ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4572ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4573ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4574ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4575ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4576ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4577ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4578ed502f03SStefano Zampini 
4579ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4580ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4581ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4582ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4583ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4584ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4585ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4586ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4587ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4588ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4589ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4590ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4591ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4592ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4593ed502f03SStefano Zampini #endif
4594ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4595ed502f03SStefano Zampini       }
4596ed502f03SStefano Zampini     }
4597ed502f03SStefano Zampini 
4598ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4599ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4600ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4601ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4602ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4603ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4604ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4605ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4606ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4607ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4608ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4609ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4610ed502f03SStefano Zampini     } else {
4611ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4612ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4613ed502f03SStefano Zampini     }
4614ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4615ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4616ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4617ed502f03SStefano Zampini     c->maxnz = c->nz;
4618ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4619ed502f03SStefano Zampini     c->rmax = 0;
4620ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4621ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4622ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4623ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4624ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4625ed502f03SStefano Zampini     }
4626ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4627ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4628ed502f03SStefano Zampini     (*C)->nonzerostate++;
4629ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4630ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4631ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4632ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4633ed502f03SStefano Zampini   } else {
4634*c0aa6a63SJacob Faibussowitsch     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4635ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4636ed502f03SStefano Zampini     if (c->nz) {
4637ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4638ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4639ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4640ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4641ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4642ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4643ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4644ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4645ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4646ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4647ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4648*c0aa6a63SJacob Faibussowitsch       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
4649*c0aa6a63SJacob Faibussowitsch       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4650*c0aa6a63SJacob Faibussowitsch       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4651*c0aa6a63SJacob Faibussowitsch       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4652*c0aa6a63SJacob Faibussowitsch       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4653ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4654ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4655ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4656ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4657ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4658ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4659ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4660ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4661ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4662ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4663ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4664ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4665ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4666a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
46671a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4668ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4669ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4670ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4671ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4672ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4673ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4674ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4675ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46761a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4677ed502f03SStefano Zampini       }
4678ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4679ed502f03SStefano Zampini     }
4680ed502f03SStefano Zampini   }
4681ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4682ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4683ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4684ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4685ed502f03SStefano Zampini   PetscFunctionReturn(0);
4686ed502f03SStefano Zampini }
4687c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the matrix value array (as returned by
  MatSeqAIJCUSPARSEGetArrayRead()) into v.

  Input Parameters:
+ A   - the matrix whose values are read
. n   - number of values to copy
- idx - positions in the value array to gather; if NULL the first n values are copied contiguously

  Output Parameter:
. v   - destination for the n values; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  idx is uploaded into a device index vector and the upload is logged as a CPU-to-GPU transfer,
  i.e. idx is treated as a host array here.

  When v is not CUDA memory the values travel device -> host, so that traffic is logged with
  PetscLogGpuToCpu().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* gather path: out[k] = av[idx[k]] for k in [0,n) */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Staging buffer, only sized when v cannot be written directly by thrust.
       It is a scoped object so it is released on every exit path, including the
       early return taken by CHKERRCUDA below. */
    THRUSTARRAY w(dmem ? (size_t)0 : (size_t)n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back from the staging buffer */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* contiguous path: copy the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* values went from the GPU to the host, log the transfer in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4727