xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision afb2bd1ca7437fabdd8895a5aa9b07d4f5e78676)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18*afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19*afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20*afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21*afb2bd1cSJunchao Zhang 
22*afb2bd1cSJunchao Zhang   typedef enum {
23*afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24*afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25*afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26*afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27*afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28*afb2bd1cSJunchao Zhang 
29*afb2bd1cSJunchao Zhang   typedef enum {
30*afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31*afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32*afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33*afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34*afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35*afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36*afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37*afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38*afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39*afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40*afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41*afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42*afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43*afb2bd1cSJunchao Zhang 
44*afb2bd1cSJunchao Zhang   typedef enum {
45*afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46*afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47*afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48*afb2bd1cSJunchao Zhang   */
49*afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50*afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51*afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52*afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
749ae82921SPaul Mullowney 
757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
817f756511SDominic Meiser 
/*
   MatCUSPARSESetStream - Sets the CUDA stream used by the matrix's cuSPARSE handle.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  stream - the CUDA stream to attach to the cusparse handle
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing spptr");
  cusparsestruct->stream = stream;
  /* all subsequent cuSPARSE calls made through this handle run on the given stream */
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
92b06137fdSPaul Mullowney 
/*
   MatCUSPARSESetHandle - Makes the matrix use the given cuSPARSE handle,
   destroying any different handle it currently holds.

   Input Parameters:
+  A      - matrix whose spptr holds a Mat_SeqAIJCUSPARSE structure
-  handle - the cusparse handle to adopt
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    if (cusparsestruct->handle) {
      /* release the handle we currently hold before adopting the caller's */
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  /* scalar arguments (alpha/beta) are interpreted as device pointers */
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
108b06137fdSPaul Mullowney 
/*
   MatCUSPARSEClearHandle - Forgets the cuSPARSE handle held by the matrix
   without destroying it.

   NOTE(review): the handle is dropped, not cusparseDestroy()ed — presumably
   ownership lies with whoever installed it via MatCUSPARSESetHandle(); confirm.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* nothing to clear if the GPU structure was never created */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
117b06137fdSPaul Mullowney 
/* Query callback composed on factor matrices: reports MATSOLVERCUSPARSE as the
   solver package that produced the factor. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1249ae82921SPaul Mullowney 
125c708e6cdSJed Brown /*MC
126087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
127087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
128087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
129087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
130087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
131087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
132c708e6cdSJed Brown 
1339ae82921SPaul Mullowney   Level: beginner
134c708e6cdSJed Brown 
1353ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
136c708e6cdSJed Brown M*/
1379ae82921SPaul Mullowney 
/* MatGetFactor() constructor registered for (SEQAIJCUSPARSE, MATSOLVERCUSPARSE):
   creates an empty square factor matrix B of matching size and installs the
   symbolic-factorization ops for the requested factor type. */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;   /* factors are square, sized by A's row map */

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* factortype is set before MatSetType(); presumably the type constructor consults it — keep this order */
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    /* NOTE(review): block sizes are only propagated for LU/ILU, not for the
       Cholesky/ICC branch below — confirm the asymmetry is intentional */
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  /* MAT_SKIP_ALLOCATION: the symbolic factorization allocates the structure later */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1639ae82921SPaul Mullowney 
/* Type-specific implementation of MatCUSPARSESetFormat(): records the requested
   GPU storage format. For SEQAIJCUSPARSE a single format field backs both
   MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL, so the two cases share one body. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT:  /* deliberate fallthrough: both ops set the same field */
  case MAT_CUSPARSE_ALL:
    cusparsestruct->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
1819ae82921SPaul Mullowney 
182e057df02SPaul Mullowney /*@
183e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
184e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
185aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
186e057df02SPaul Mullowney    Not Collective
187e057df02SPaul Mullowney 
188e057df02SPaul Mullowney    Input Parameters:
1898468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
19036d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
1912692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
192e057df02SPaul Mullowney 
193e057df02SPaul Mullowney    Output Parameter:
194e057df02SPaul Mullowney 
195e057df02SPaul Mullowney    Level: intermediate
196e057df02SPaul Mullowney 
1978468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
198e057df02SPaul Mullowney @*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific "MatCUSPARSESetFormat_C" method if the matrix
     type composed one; silently a no-op for other matrix types */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
208e057df02SPaul Mullowney 
209e6e9a74fSStefano Zampini /*@
210e6e9a74fSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
211e6e9a74fSStefano Zampini 
212e6e9a74fSStefano Zampini    Collective on mat
213e6e9a74fSStefano Zampini 
214e6e9a74fSStefano Zampini    Input Parameters:
215e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
216e6e9a74fSStefano Zampini -  transgen - the boolean flag
217e6e9a74fSStefano Zampini 
218e6e9a74fSStefano Zampini    Level: intermediate
219e6e9a74fSStefano Zampini 
220e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE
221e6e9a74fSStefano Zampini @*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode     ierr;
  PetscBool          isseqaijcusparse;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  /* silently ignore other matrix types, in the spirit of a Try-method */
  if (!isseqaijcusparse) PetscFunctionReturn(0);
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusp->transgen = transgen;
  if (!transgen) {
    /* drop any cached transpose so a later transgen=PETSC_TRUE cannot observe a stale one */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
241e6e9a74fSStefano Zampini 
/* Process -mat_cusparse_* runtime options for a SEQAIJCUSPARSE matrix.
   Options are consulted only for non-factored matrices. */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* storage format for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* storage format for all operations; processed second, so when both options
       are given this one overwrites the single format field set above */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA-11 generic-API algorithm choices. PetscOptionsEnum() yields the
       0-based position in the string arrays at the top of this file, which must
       stay in sync with the cuSPARSE enum values — hence the guards below. */
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2859ae82921SPaul Mullowney 
/* Symbolic ILU: delegate to the host SeqAIJ implementation, then redirect the
   numeric phase to the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* override the op installed by the host symbolic routine */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2959ae82921SPaul Mullowney 
/* Symbolic LU: delegate to the host SeqAIJ implementation, then redirect the
   numeric phase to the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* override the op installed by the host symbolic routine */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3059ae82921SPaul Mullowney 
/* Symbolic ICC: delegate to the host SeqAIJ implementation, then redirect the
   numeric phase to the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override the op installed by the host symbolic routine */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
315087f3262SPaul Mullowney 
/* Symbolic Cholesky: delegate to the host SeqAIJ implementation, then redirect
   the numeric phase to the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* override the op installed by the host symbolic routine */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
325087f3262SPaul Mullowney 
/* Extract the unit-diagonal lower-triangular factor L from the host ILU data of
   a factored SeqAIJ matrix, upload it in CSR form to the GPU, and run the
   cuSPARSE triangular-solve analysis on it. Work is done only when the data is
   not already resident on the GPU (offloadmask UNALLOCATED or CPU). */
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;  /* pinned host CSR row offsets / column indices for L */
  PetscScalar                       *AALo;         /* pinned host CSR values for L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* rows 1..n-1 contribute ai[i+1]-ai[i] strictly-lower entries each (row 0 has none), plus n unit diagonals */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* pinned (page-locked) host memory, freed again after the device copies below */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;
      AALo[0]  = (MatScalar) 1.0;  /* row 0 of L is just its unit diagonal */
      v        = aa;
      vi       = aj;
      offset   = 1;
      rowOffset= 1;
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower entries of row i ... */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        /* ... then append the implicit unit diagonal at the end of the row */
        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 API: type GENERAL, triangularity conveyed via fill mode/diag type below */
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      /* device-side CSR arrays; assign() copies host -> device */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 requires a caller-provided work buffer for the analysis/solve */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* NOTE(review): these pinned buffers leak if an error return (CHKERRQ/CHKERRCUDA/
         CHKERRCUSPARSE) fires between allocation and here — confirm acceptability */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
4469ae82921SPaul Mullowney 
447087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4489ae82921SPaul Mullowney {
4499ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4509ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4519ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
452aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
4539ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4549ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
4559ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
4569ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4579ae82921SPaul Mullowney   PetscScalar                       *AAUp;
4589ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
4599ae82921SPaul Mullowney   PetscErrorCode                    ierr;
46057d48284SJunchao Zhang   cudaError_t                       cerr;
4619ae82921SPaul Mullowney 
4629ae82921SPaul Mullowney   PetscFunctionBegin;
463cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
464c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4659ae82921SPaul Mullowney     try {
4669ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4679ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
4689ae82921SPaul Mullowney 
4699ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
47057d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
47157d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
47257d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
4739ae82921SPaul Mullowney 
4749ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
4759ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
4769ae82921SPaul Mullowney       AiUp[n]=nzUpper;
4779ae82921SPaul Mullowney       offset = nzUpper;
4789ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
4799ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
4809ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
4819ae82921SPaul Mullowney 
482e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
4839ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
4849ae82921SPaul Mullowney 
485e057df02SPaul Mullowney         /* decrement the offset */
4869ae82921SPaul Mullowney         offset -= (nz+1);
4879ae82921SPaul Mullowney 
488e057df02SPaul Mullowney         /* first, set the diagonal elements */
4899ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
49009f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1./v[nz];
4919ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
4929ae82921SPaul Mullowney 
493580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
494580bdb30SBarry Smith         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
4959ae82921SPaul Mullowney       }
4962205254eSKarl Rupp 
497aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
498aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
4992205254eSKarl Rupp 
500aa372e3fSPaul Mullowney       /* Create the matrix description */
50157d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
50257d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
503*afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
504*afb2bd1cSJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
505*afb2bd1cSJunchao Zhang      #else
50657d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
507*afb2bd1cSJunchao Zhang      #endif
50857d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
50957d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
510aa372e3fSPaul Mullowney 
511aa372e3fSPaul Mullowney       /* set the operation */
512aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
513aa372e3fSPaul Mullowney 
514aa372e3fSPaul Mullowney       /* set the matrix */
515aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
516aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
517aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
518aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
519aa372e3fSPaul Mullowney 
520aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
521aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
522aa372e3fSPaul Mullowney 
523aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
524aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
525aa372e3fSPaul Mullowney 
526aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
527aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
528aa372e3fSPaul Mullowney 
529*afb2bd1cSJunchao Zhang       /* Create the solve analysis information */
530*afb2bd1cSJunchao Zhang       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
531*afb2bd1cSJunchao Zhang     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
532*afb2bd1cSJunchao Zhang       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
533*afb2bd1cSJunchao Zhang                                    upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
534*afb2bd1cSJunchao Zhang                                    upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
535*afb2bd1cSJunchao Zhang                                    upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
536*afb2bd1cSJunchao Zhang                                    &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
537*afb2bd1cSJunchao Zhang       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
538*afb2bd1cSJunchao Zhang     #endif
539*afb2bd1cSJunchao Zhang 
540aa372e3fSPaul Mullowney       /* perform the solve analysis */
541aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
542aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
543aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
544*afb2bd1cSJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
545*afb2bd1cSJunchao Zhang                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
546*afb2bd1cSJunchao Zhang                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
547*afb2bd1cSJunchao Zhang                              #endif
548*afb2bd1cSJunchao Zhang                               );CHKERRCUSPARSE(stat);
549aa372e3fSPaul Mullowney 
550aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
551aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
5522205254eSKarl Rupp 
55357d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
55457d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
55557d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
5564863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
5579ae82921SPaul Mullowney     } catch(char *ex) {
5589ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5599ae82921SPaul Mullowney     }
5609ae82921SPaul Mullowney   }
5619ae82921SPaul Mullowney   PetscFunctionReturn(0);
5629ae82921SPaul Mullowney }
5639ae82921SPaul Mullowney 
564087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
5659ae82921SPaul Mullowney {
5669ae82921SPaul Mullowney   PetscErrorCode               ierr;
5679ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
5689ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
5699ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
5709ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
5719ae82921SPaul Mullowney   const PetscInt               *r,*c;
5729ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
5739ae82921SPaul Mullowney 
5749ae82921SPaul Mullowney   PetscFunctionBegin;
575ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
576087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
577087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
5782205254eSKarl Rupp 
579e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
580aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
5819ae82921SPaul Mullowney 
582c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
583e057df02SPaul Mullowney   /* lower triangular indices */
5849ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5859ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5862205254eSKarl Rupp   if (!row_identity) {
587aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
588aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
5892205254eSKarl Rupp   }
5909ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5919ae82921SPaul Mullowney 
592e057df02SPaul Mullowney   /* upper triangular indices */
5939ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
5949ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5952205254eSKarl Rupp   if (!col_identity) {
596aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
597aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
5982205254eSKarl Rupp   }
5994863603aSSatish Balay 
6004863603aSSatish Balay   if (!row_identity && !col_identity) {
6014863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
6024863603aSSatish Balay   } else if(!row_identity) {
6034863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
6044863603aSSatish Balay   } else if(!col_identity) {
6054863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
6064863603aSSatish Balay   }
6074863603aSSatish Balay 
6089ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
6099ae82921SPaul Mullowney   PetscFunctionReturn(0);
6109ae82921SPaul Mullowney }
6119ae82921SPaul Mullowney 
612087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
613087f3262SPaul Mullowney {
614087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
615087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
616aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
617aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
618087f3262SPaul Mullowney   cusparseStatus_t                  stat;
619087f3262SPaul Mullowney   PetscErrorCode                    ierr;
62057d48284SJunchao Zhang   cudaError_t                       cerr;
621087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
622087f3262SPaul Mullowney   PetscScalar                       *AAUp;
623087f3262SPaul Mullowney   PetscScalar                       *AALo;
624087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
625087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
626087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
627087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
628087f3262SPaul Mullowney 
629087f3262SPaul Mullowney   PetscFunctionBegin;
630cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
631c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
632087f3262SPaul Mullowney     try {
633087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
63457d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
63557d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
63657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
63757d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
638087f3262SPaul Mullowney 
639087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
640087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
641087f3262SPaul Mullowney       AiUp[n]=nzUpper;
642087f3262SPaul Mullowney       offset = 0;
643087f3262SPaul Mullowney       for (i=0; i<n; i++) {
644087f3262SPaul Mullowney         /* set the pointers */
645087f3262SPaul Mullowney         v  = aa + ai[i];
646087f3262SPaul Mullowney         vj = aj + ai[i];
647087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
648087f3262SPaul Mullowney 
649087f3262SPaul Mullowney         /* first, set the diagonal elements */
650087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
65109f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1.0/v[nz];
652087f3262SPaul Mullowney         AiUp[i]      = offset;
65309f51544SAlejandro Lamas Daviña         AALo[offset] = (MatScalar)1.0/v[nz];
654087f3262SPaul Mullowney 
655087f3262SPaul Mullowney         offset+=1;
656087f3262SPaul Mullowney         if (nz>0) {
657f22e0265SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
658580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
659087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
660087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
661087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
662087f3262SPaul Mullowney           }
663087f3262SPaul Mullowney           offset+=nz;
664087f3262SPaul Mullowney         }
665087f3262SPaul Mullowney       }
666087f3262SPaul Mullowney 
667aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
668aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
669087f3262SPaul Mullowney 
670aa372e3fSPaul Mullowney       /* Create the matrix description */
67157d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
67257d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
673*afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
674*afb2bd1cSJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
675*afb2bd1cSJunchao Zhang      #else
67657d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
677*afb2bd1cSJunchao Zhang      #endif
67857d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
67957d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
680087f3262SPaul Mullowney 
681aa372e3fSPaul Mullowney       /* set the matrix */
682aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
683aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
684aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
685aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
686aa372e3fSPaul Mullowney 
687aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
688aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
689aa372e3fSPaul Mullowney 
690aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
691aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
692aa372e3fSPaul Mullowney 
693aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
694aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
695aa372e3fSPaul Mullowney 
696*afb2bd1cSJunchao Zhang       /* set the operation */
697*afb2bd1cSJunchao Zhang       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
698*afb2bd1cSJunchao Zhang 
699*afb2bd1cSJunchao Zhang       /* Create the solve analysis information */
700*afb2bd1cSJunchao Zhang       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
701*afb2bd1cSJunchao Zhang     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
702*afb2bd1cSJunchao Zhang       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
703*afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
704*afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
705*afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
706*afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
707*afb2bd1cSJunchao Zhang       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
708*afb2bd1cSJunchao Zhang     #endif
709*afb2bd1cSJunchao Zhang 
710aa372e3fSPaul Mullowney       /* perform the solve analysis */
711aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
712aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
713aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
714*afb2bd1cSJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
715*afb2bd1cSJunchao Zhang                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
716*afb2bd1cSJunchao Zhang                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
717*afb2bd1cSJunchao Zhang                               #endif
718*afb2bd1cSJunchao Zhang                               );CHKERRCUSPARSE(stat);
719aa372e3fSPaul Mullowney 
720aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
721aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
722aa372e3fSPaul Mullowney 
723aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
724aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
725aa372e3fSPaul Mullowney 
726aa372e3fSPaul Mullowney       /* Create the matrix description */
72757d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
72857d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
729*afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
730*afb2bd1cSJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
731*afb2bd1cSJunchao Zhang      #else
73257d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
733*afb2bd1cSJunchao Zhang      #endif
73457d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
73557d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
736aa372e3fSPaul Mullowney 
737aa372e3fSPaul Mullowney       /* set the operation */
738aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
739aa372e3fSPaul Mullowney 
740aa372e3fSPaul Mullowney       /* set the matrix */
741aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
742aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
743aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
744aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
745aa372e3fSPaul Mullowney 
746aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
747aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
748aa372e3fSPaul Mullowney 
749aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
750aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
751aa372e3fSPaul Mullowney 
752aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
753aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
7544863603aSSatish Balay       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
755aa372e3fSPaul Mullowney 
756*afb2bd1cSJunchao Zhang       /* Create the solve analysis information */
757*afb2bd1cSJunchao Zhang       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
758*afb2bd1cSJunchao Zhang     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
759*afb2bd1cSJunchao Zhang       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
760*afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
761*afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
762*afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
763*afb2bd1cSJunchao Zhang                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
764*afb2bd1cSJunchao Zhang       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
765*afb2bd1cSJunchao Zhang     #endif
766*afb2bd1cSJunchao Zhang 
767aa372e3fSPaul Mullowney       /* perform the solve analysis */
768aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
769aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
770aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
771*afb2bd1cSJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
772*afb2bd1cSJunchao Zhang                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
773*afb2bd1cSJunchao Zhang                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
774*afb2bd1cSJunchao Zhang                               #endif
775*afb2bd1cSJunchao Zhang                               );CHKERRCUSPARSE(stat);
776aa372e3fSPaul Mullowney 
777aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
778aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
779087f3262SPaul Mullowney 
780c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
78157d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
78257d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
78357d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
78457d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
785087f3262SPaul Mullowney     } catch(char *ex) {
786087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
787087f3262SPaul Mullowney     }
788087f3262SPaul Mullowney   }
789087f3262SPaul Mullowney   PetscFunctionReturn(0);
790087f3262SPaul Mullowney }
791087f3262SPaul Mullowney 
792087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
7939ae82921SPaul Mullowney {
7949ae82921SPaul Mullowney   PetscErrorCode               ierr;
795087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
796087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
797087f3262SPaul Mullowney   IS                           ip = a->row;
798087f3262SPaul Mullowney   const PetscInt               *rip;
799087f3262SPaul Mullowney   PetscBool                    perm_identity;
800087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
801087f3262SPaul Mullowney 
802087f3262SPaul Mullowney   PetscFunctionBegin;
803ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
804087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
805e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
806aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
807aa372e3fSPaul Mullowney 
808087f3262SPaul Mullowney   /* lower triangular indices */
809087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
810087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
811087f3262SPaul Mullowney   if (!perm_identity) {
8124e4bbfaaSStefano Zampini     IS             iip;
8134e4bbfaaSStefano Zampini     const PetscInt *irip;
8144e4bbfaaSStefano Zampini 
8154e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
8164e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
817aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
818aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
819aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
8204e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
8214e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
8224e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
8234863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
824087f3262SPaul Mullowney   }
825087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
826087f3262SPaul Mullowney   PetscFunctionReturn(0);
827087f3262SPaul Mullowney }
828087f3262SPaul Mullowney 
8296fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
8309ae82921SPaul Mullowney {
8319ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
8329ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
8339ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
834b175d8bbSPaul Mullowney   PetscErrorCode ierr;
8359ae82921SPaul Mullowney 
8369ae82921SPaul Mullowney   PetscFunctionBegin;
8379ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
838ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
839e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
8409ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
8419ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
842bda325fcSPaul Mullowney   if (row_identity && col_identity) {
843bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
844bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
8454e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8464e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
847bda325fcSPaul Mullowney   } else {
848bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
849bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
8504e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8514e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
852bda325fcSPaul Mullowney   }
8538dc1d2a3SPaul Mullowney 
854e057df02SPaul Mullowney   /* get the triangular factors */
855087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
8569ae82921SPaul Mullowney   PetscFunctionReturn(0);
8579ae82921SPaul Mullowney }
8589ae82921SPaul Mullowney 
859087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
860087f3262SPaul Mullowney {
861087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
862087f3262SPaul Mullowney   IS             ip = b->row;
863087f3262SPaul Mullowney   PetscBool      perm_identity;
864b175d8bbSPaul Mullowney   PetscErrorCode ierr;
865087f3262SPaul Mullowney 
866087f3262SPaul Mullowney   PetscFunctionBegin;
867087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
868ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
869087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
870087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
871087f3262SPaul Mullowney   if (perm_identity) {
872087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
873087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
8744e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8754e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
876087f3262SPaul Mullowney   } else {
877087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
878087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
8794e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8804e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
881087f3262SPaul Mullowney   }
882087f3262SPaul Mullowney 
883087f3262SPaul Mullowney   /* get the triangular factors */
884087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
885087f3262SPaul Mullowney   PetscFunctionReturn(0);
886087f3262SPaul Mullowney }
8879ae82921SPaul Mullowney 
888b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
889bda325fcSPaul Mullowney {
890bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
891aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
892aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
893aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
894aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
895bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
896aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
897aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
898aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
899aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
900b175d8bbSPaul Mullowney 
901bda325fcSPaul Mullowney   PetscFunctionBegin;
902bda325fcSPaul Mullowney 
903aa372e3fSPaul Mullowney   /*********************************************/
904aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
905aa372e3fSPaul Mullowney   /*********************************************/
906aa372e3fSPaul Mullowney 
907aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
908aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
909aa372e3fSPaul Mullowney 
910aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
911aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
912aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
913aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
914aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
915aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
916aa372e3fSPaul Mullowney 
917aa372e3fSPaul Mullowney   /* Create the matrix description */
91857d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
91957d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
92057d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
92157d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
92257d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
923aa372e3fSPaul Mullowney 
924aa372e3fSPaul Mullowney   /* set the operation */
925aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
926aa372e3fSPaul Mullowney 
927aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
928aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
929*afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
930*afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
931aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
932*afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
933*afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
934*afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
935aa372e3fSPaul Mullowney 
936aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
937*afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
938*afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
939*afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
940*afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
941*afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
942*afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
943*afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
944*afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
945*afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
946*afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
947*afb2bd1cSJunchao Zhang   cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
948*afb2bd1cSJunchao Zhang #endif
949*afb2bd1cSJunchao Zhang 
950aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
951aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
952aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
953aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
954aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
955aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
956*afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
957*afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
958*afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
959*afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
960*afb2bd1cSJunchao Zhang                         #else
961*afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
962*afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
963*afb2bd1cSJunchao Zhang                         #endif
964*afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
965aa372e3fSPaul Mullowney 
966*afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
967*afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
968*afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
969*afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
970*afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
971*afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
972*afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
973*afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
974*afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
975*afb2bd1cSJunchao Zhang #endif
976*afb2bd1cSJunchao Zhang 
977*afb2bd1cSJunchao Zhang   /* perform the solve analysis */
978aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
979*afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
980*afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
981*afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
982*afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
983*afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
984*afb2bd1cSJunchao Zhang                           #endif
985*afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
986aa372e3fSPaul Mullowney 
987aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
988aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
989aa372e3fSPaul Mullowney 
990aa372e3fSPaul Mullowney   /*********************************************/
991aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
992aa372e3fSPaul Mullowney   /*********************************************/
993aa372e3fSPaul Mullowney 
994aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
995aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
996aa372e3fSPaul Mullowney 
997aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
998aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
999aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1000aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1001aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1002aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1003aa372e3fSPaul Mullowney 
1004aa372e3fSPaul Mullowney   /* Create the matrix description */
100557d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
100657d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
100757d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
100857d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
100957d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1010aa372e3fSPaul Mullowney 
1011aa372e3fSPaul Mullowney   /* set the operation */
1012aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1013aa372e3fSPaul Mullowney 
1014aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1015aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1016*afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1017*afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1018aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1019*afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1020*afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1021*afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1022aa372e3fSPaul Mullowney 
1023aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1024*afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1025*afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1026*afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1027*afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1028*afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1029*afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1030*afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1031*afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1032*afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1033*afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1034*afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1035*afb2bd1cSJunchao Zhang #endif
1036*afb2bd1cSJunchao Zhang 
1037aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1038aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1039aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1040aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1041aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1042aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1043*afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1044*afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1045*afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1046*afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1047*afb2bd1cSJunchao Zhang                         #else
1048*afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1049*afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1050*afb2bd1cSJunchao Zhang                         #endif
1051*afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1052aa372e3fSPaul Mullowney 
1053*afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1054*afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1055*afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1056*afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1057*afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1058*afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1059*afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1060*afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1061*afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1062*afb2bd1cSJunchao Zhang   #endif
1063*afb2bd1cSJunchao Zhang 
1064*afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1065aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1066*afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1067*afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1068*afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1069*afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1070*afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1071*afb2bd1cSJunchao Zhang                           #endif
1072*afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
1073aa372e3fSPaul Mullowney 
1074aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
1075aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1076bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1077bda325fcSPaul Mullowney }
1078bda325fcSPaul Mullowney 
1079b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1080bda325fcSPaul Mullowney {
1081aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1082aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1083aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1084bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1085bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1086aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1087b06137fdSPaul Mullowney   cudaError_t                  err;
108885ba7357SStefano Zampini   PetscErrorCode               ierr;
1089b175d8bbSPaul Mullowney 
1090bda325fcSPaul Mullowney   PetscFunctionBegin;
109185ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
109285ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
109385ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
109485ba7357SStefano Zampini   /* create cusparse matrix */
1095aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
109657d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1097aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
109857d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
109957d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1100aa372e3fSPaul Mullowney 
1101b06137fdSPaul Mullowney   /* set alpha and beta */
1102*afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
11037656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
11047656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1105*afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
11067656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
11077656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
110857d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1109b06137fdSPaul Mullowney 
1110aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1111aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1112aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1113554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1114554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1115aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1116a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1117aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1118aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1119a3fdcf43SKarl Rupp 
112081902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
112181902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1122*afb2bd1cSJunchao Zhang 
112381902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1124*afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1125*afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1126*afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1127*afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1128*afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1129*afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1130*afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1131*afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1132*afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1133*afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1134*afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1135*afb2bd1cSJunchao Zhang    #endif
1136*afb2bd1cSJunchao Zhang 
1137a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1138a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1139aa372e3fSPaul Mullowney                             matrix->values->data().get(),
114081902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1141aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1142aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1143*afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144*afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1145*afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1146*afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1147*afb2bd1cSJunchao Zhang                           #else
1148*afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1149*afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1150*afb2bd1cSJunchao Zhang                           #endif
1151*afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
1152aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1153*afb2bd1cSJunchao Zhang 
1154*afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1155*afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1156*afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1157*afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1158*afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1159*afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1160*afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1161*afb2bd1cSJunchao Zhang    #endif
1162aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1163*afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164*afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1165*afb2bd1cSJunchao Zhang    #else
1166aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
116751c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
116851c6d536SStefano Zampini     /* First convert HYB to CSR */
1169aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1170aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1171aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1172aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1173aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1174aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1175aa372e3fSPaul Mullowney 
1176aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1177aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1178aa372e3fSPaul Mullowney                             temp->values->data().get(),
1179aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
118057d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1181aa372e3fSPaul Mullowney 
1182aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1183aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1184aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1185aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1186aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1187aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1188aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1189aa372e3fSPaul Mullowney 
1190aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1191aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1192aa372e3fSPaul Mullowney                             temp->values->data().get(),
1193aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1194aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1195aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1196aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1197aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
119857d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1199aa372e3fSPaul Mullowney 
1200aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1201aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
120257d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1203aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1204aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1205aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1206aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1207aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1208aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
120957d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1210aa372e3fSPaul Mullowney 
1211aa372e3fSPaul Mullowney     /* assign the pointer */
1212aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1213aa372e3fSPaul Mullowney     /* delete temporaries */
1214aa372e3fSPaul Mullowney     if (tempT) {
1215aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1216aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1217aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1218aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1219087f3262SPaul Mullowney     }
1220aa372e3fSPaul Mullowney     if (temp) {
1221aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1222aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1223aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1224aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1225aa372e3fSPaul Mullowney     }
1226*afb2bd1cSJunchao Zhang    #endif
1227aa372e3fSPaul Mullowney   }
122805035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
122985ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
123085ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1231213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1232213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1233aa372e3fSPaul Mullowney   /* assign the pointer */
1234aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1235bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1236bda325fcSPaul Mullowney }
1237bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU when the factorization used a non-natural ordering.
   The right-hand side is first reordered with the row permutation; then the
   transposed factors are applied (U^T before L^T, i.e. the reverse of the
   untransposed solve); finally the result is reordered with the column
   permutation.  The transposed triangular factors are built lazily on the
   first call. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;          /* local length of the solution vector */
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly (only on the first call) */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into x; the index iterators
     (rpermIndices) select which entries of b land where */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* First, solve U: reads the permuted rhs from x, writes into the work vector.
     The CUDA-11+ variant of cusparse_solve() additionally takes the entry count
     and the solve policy/buffer produced by the analysis phase. */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve L: reads the work vector, writes the (still unpermuted) solution into x */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Copy the temporary to the full solution. */
  thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* roughly one multiply and one add per stored factor entry */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1324bda325fcSPaul Mullowney 
/* Solve A^T x = b when the factorization was done in natural ordering, so no
   row/column permutations are needed.  The transposed triangular factors are
   created on demand the first time this routine is called. */
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs     = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work   = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Lazily build the transposed triangular factors */
  if (!lowerT && !upperT) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtrTranspose;
  }

  /* Device pointers for the input and output vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* For the transposed system the upper factor is applied first: b -> work.
     The CUDA-11+ form of cusparse_solve() also passes the entry count and the
     policy/buffer from the analysis phase. */
  cstat = cusparse_solve(fs->handle, upperT->solveOp,
                         upperT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         upperT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upperT->descr,
                         upperT->csrMat->values->data().get(),
                         upperT->csrMat->row_offsets->data().get(),
                         upperT->csrMat->column_indices->data().get(),
                         upperT->solveInfo,
                         b, work->data().get()
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         ,upperT->solvePolicy, upperT->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* ... followed by the lower factor, writing the solution into x: work -> x */
  cstat = cusparse_solve(fs->handle, lowerT->solveOp,
                         lowerT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         lowerT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lowerT->descr,
                         lowerT->csrMat->values->data().get(),
                         lowerT->csrMat->row_offsets->data().get(),
                         lowerT->csrMat->column_indices->data().get(),
                         lowerT->solveInfo,
                         work->data().get(), x
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         ,lowerT->solvePolicy, lowerT->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Give the vectors back and account for the work done */
  ierr  = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1392bda325fcSPaul Mullowney 
/* Solve A x = b on the GPU when the factorization used a non-natural ordering:
   reorder b with the row permutation into the work vector, apply the L then U
   triangular solves with cusparse, and reorder the result with the column
   permutation into x. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder b with the row permutation into the work vector.
     The range is delimited by the index iterators (rpermIndices); the element
     iterator bGPU is the same in both permutation iterators. */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L: reads the permuted rhs from the work vector, writes into x.
     The CUDA-11+ variant of cusparse_solve() also takes the entry count and the
     solve policy/buffer created during the analysis phase. */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U: reads x, writes the (still unpermuted) solution back into the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation: work vector -> x */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* roughly one multiply and one add per stored factor entry */
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
14679ae82921SPaul Mullowney 
/* Solve A x = b when the factorization was done in natural ordering: no
   permutations are applied, just the forward (L) and backward (U) triangular
   solves through cusparse. */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *fs    = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper = (Mat_SeqAIJCUSPARSETriFactorStruct*)fs->upTriFactorPtr;
  THRUSTARRAY                       *work  = (THRUSTARRAY*)fs->workVector;
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Device pointers for the input and output vectors */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Forward solve with the lower factor: b -> work.  The CUDA-11+ form of
     cusparse_solve() also passes the entry count and the policy/buffer from
     the analysis phase. */
  cstat = cusparse_solve(fs->handle, lower->solveOp,
                         lower->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         lower->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lower->descr,
                         lower->csrMat->values->data().get(),
                         lower->csrMat->row_offsets->data().get(),
                         lower->csrMat->column_indices->data().get(),
                         lower->solveInfo,
                         b, work->data().get()
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         ,lower->solvePolicy, lower->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Backward solve with the upper factor: work -> x */
  cstat = cusparse_solve(fs->handle, upper->solveOp,
                         upper->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         upper->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upper->descr,
                         upper->csrMat->values->data().get(),
                         upper->csrMat->row_offsets->data().get(),
                         upper->csrMat->column_indices->data().get(),
                         upper->solveInfo,
                         work->data().get(), x
                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                         ,upper->solvePolicy, upper->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Give the vectors back and account for the work done */
  ierr  = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*fs->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
15279ae82921SPaul Mullowney 
15286fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
15299ae82921SPaul Mullowney {
1530aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
15317c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
15329ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1533213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
15349ae82921SPaul Mullowney   PetscErrorCode               ierr;
1535aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1536b06137fdSPaul Mullowney   cudaError_t                  err;
15379ae82921SPaul Mullowney 
15389ae82921SPaul Mullowney   PetscFunctionBegin;
153995639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1540c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
154181902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
154281902715SJunchao Zhang       /* Copy values only */
1543*afb2bd1cSJunchao Zhang       CsrMatrix *matrix,*matrixT;
1544*afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
154585ba7357SStefano Zampini 
154685ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
154785ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1548*afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
154905035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
155085ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
15514863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
155285ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
155381902715SJunchao Zhang 
155481902715SJunchao Zhang       /* Update matT when it was built before */
155581902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
155681902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1557*afb2bd1cSJunchao Zhang         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
155885ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
155985ba7357SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
156081902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1561*afb2bd1cSJunchao Zhang                             A->cmap->n, matrix->num_entries,
1562*afb2bd1cSJunchao Zhang                             matrix->values->data().get(),
156381902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1564*afb2bd1cSJunchao Zhang                             matrix->column_indices->data().get(),
1565*afb2bd1cSJunchao Zhang                             matrixT->values->data().get(),
1566*afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1567*afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1568*afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1569*afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1570*afb2bd1cSJunchao Zhang                           #else
1571*afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1572*afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1573*afb2bd1cSJunchao Zhang                           #endif
1574*afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
157505035670SJunchao Zhang         err  = WaitForCUDA();CHKERRCUDA(err);
157685ba7357SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
157785ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
157881902715SJunchao Zhang       }
157934d6c7a5SJose E. Roman     } else {
158085ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
158185ba7357SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
15827c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
15837c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
15847c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
158581902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
15869ae82921SPaul Mullowney       try {
15879ae82921SPaul Mullowney         if (a->compressedrow.use) {
15889ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
15899ae82921SPaul Mullowney           ii   = a->compressedrow.i;
15909ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
15919ae82921SPaul Mullowney         } else {
1592213423ffSJunchao Zhang           m    = A->rmap->n;
1593213423ffSJunchao Zhang           ii   = a->i;
1594e6e9a74fSStefano Zampini           ridx = NULL;
15959ae82921SPaul Mullowney         }
1596213423ffSJunchao Zhang         cusparsestruct->nrows = m;
15979ae82921SPaul Mullowney 
159885ba7357SStefano Zampini         /* create cusparse matrix */
1599aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
160057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
160157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
160257d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
16039ae82921SPaul Mullowney 
1604*afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
16057656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
16067656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1607*afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
16087656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
16097656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
161057d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1611b06137fdSPaul Mullowney 
1612aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1613aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1614aa372e3fSPaul Mullowney           /* set the matrix */
1615*afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1616*afb2bd1cSJunchao Zhang           mat->num_rows = m;
1617*afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1618*afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1619*afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1620*afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
16219ae82921SPaul Mullowney 
1622*afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1623*afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1624aa372e3fSPaul Mullowney 
1625*afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1626*afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1627aa372e3fSPaul Mullowney 
1628aa372e3fSPaul Mullowney           /* assign the pointer */
1629*afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1630*afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1631*afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1632*afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1633*afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1634*afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1635*afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1636*afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1637*afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1638*afb2bd1cSJunchao Zhang           }
1639*afb2bd1cSJunchao Zhang          #endif
1640aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1641*afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1642*afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1643*afb2bd1cSJunchao Zhang          #else
1644*afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1645*afb2bd1cSJunchao Zhang           mat->num_rows = m;
1646*afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1647*afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1648*afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1649*afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1650aa372e3fSPaul Mullowney 
1651*afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1652*afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1653aa372e3fSPaul Mullowney 
1654*afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1655*afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1656aa372e3fSPaul Mullowney 
1657aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
165857d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1659aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1660aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1661*afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1662*afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1663*afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1664*afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
166557d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1666aa372e3fSPaul Mullowney           /* assign the pointer */
1667aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1668aa372e3fSPaul Mullowney 
1669*afb2bd1cSJunchao Zhang           if (mat) {
1670*afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1671*afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1672*afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1673*afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1674087f3262SPaul Mullowney           }
1675*afb2bd1cSJunchao Zhang          #endif
1676087f3262SPaul Mullowney         }
1677ca45077fSPaul Mullowney 
1678aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1679213423ffSJunchao Zhang         if (a->compressedrow.use) {
1680213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1681aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1682aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1683213423ffSJunchao Zhang           tmp = m;
1684213423ffSJunchao Zhang         } else {
1685213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1686213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1687213423ffSJunchao Zhang           tmp = 0;
1688213423ffSJunchao Zhang         }
1689213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1690aa372e3fSPaul Mullowney 
1691aa372e3fSPaul Mullowney         /* assign the pointer */
1692aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
16939ae82921SPaul Mullowney       } catch(char *ex) {
16949ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
16959ae82921SPaul Mullowney       }
169605035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
169785ba7357SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
169885ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
169934d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
170034d6c7a5SJose E. Roman     }
1701c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
17029ae82921SPaul Mullowney   }
17039ae82921SPaul Mullowney   PetscFunctionReturn(0);
17049ae82921SPaul Mullowney }
17059ae82921SPaul Mullowney 
1706c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1707aa372e3fSPaul Mullowney {
1708aa372e3fSPaul Mullowney   template <typename Tuple>
1709aa372e3fSPaul Mullowney   __host__ __device__
1710aa372e3fSPaul Mullowney   void operator()(Tuple t)
1711aa372e3fSPaul Mullowney   {
1712aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1713aa372e3fSPaul Mullowney   }
1714aa372e3fSPaul Mullowney };
1715aa372e3fSPaul Mullowney 
1716e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1717e6e9a74fSStefano Zampini {
1718e6e9a74fSStefano Zampini   template <typename Tuple>
1719e6e9a74fSStefano Zampini   __host__ __device__
1720e6e9a74fSStefano Zampini   void operator()(Tuple t)
1721e6e9a74fSStefano Zampini   {
1722e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1723e6e9a74fSStefano Zampini   }
1724e6e9a74fSStefano Zampini };
1725e6e9a74fSStefano Zampini 
1726*afb2bd1cSJunchao Zhang struct MatMatCusparse {
1727ccdfe979SStefano Zampini   PetscBool            cisdense;
1728ccdfe979SStefano Zampini   PetscScalar          *Bt;
1729ccdfe979SStefano Zampini   Mat                  X;
1730*afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1731*afb2bd1cSJunchao Zhang   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1732*afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
1733*afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
1734*afb2bd1cSJunchao Zhang   size_t               spmmBufferSize;
1735*afb2bd1cSJunchao Zhang   void                 *spmmBuffer;
1736*afb2bd1cSJunchao Zhang   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1737*afb2bd1cSJunchao Zhang #endif
1738*afb2bd1cSJunchao Zhang };
1739ccdfe979SStefano Zampini 
1740ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1741ccdfe979SStefano Zampini {
1742ccdfe979SStefano Zampini   PetscErrorCode ierr;
1743ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1744ccdfe979SStefano Zampini   cudaError_t    cerr;
1745ccdfe979SStefano Zampini 
1746ccdfe979SStefano Zampini   PetscFunctionBegin;
1747ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1748*afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1749*afb2bd1cSJunchao Zhang   cusparseStatus_t stat;
1750*afb2bd1cSJunchao Zhang   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1751*afb2bd1cSJunchao Zhang   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1752*afb2bd1cSJunchao Zhang   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1753*afb2bd1cSJunchao Zhang  #endif
1754ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1755ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1756ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1757ccdfe979SStefano Zampini }
1758ccdfe979SStefano Zampini 
1759ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1760ccdfe979SStefano Zampini 
1761ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1762ccdfe979SStefano Zampini {
1763ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1764ccdfe979SStefano Zampini   Mat                          A,B;
1765*afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1766ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1767ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1768ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1769ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1770ccdfe979SStefano Zampini   const PetscScalar            *barray;
1771ccdfe979SStefano Zampini   PetscScalar                  *carray;
1772ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1773ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1774ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1775ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1776*afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1777ccdfe979SStefano Zampini 
1778ccdfe979SStefano Zampini   PetscFunctionBegin;
1779ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1780ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1781ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1782ccdfe979SStefano Zampini   A    = product->A;
1783ccdfe979SStefano Zampini   B    = product->B;
1784ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1785ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1786ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1787ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1788ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1789ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1790ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1791ccdfe979SStefano Zampini   switch (product->type) {
1792ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1793ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1794ccdfe979SStefano Zampini     mat = cusp->mat;
1795ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1796ccdfe979SStefano Zampini     m   = A->rmap->n;
1797ccdfe979SStefano Zampini     n   = B->cmap->n;
1798ccdfe979SStefano Zampini     break;
1799ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1800e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1801e6e9a74fSStefano Zampini       mat = cusp->mat;
1802e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1803e6e9a74fSStefano Zampini     } else {
1804ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1805ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1806ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1807e6e9a74fSStefano Zampini     }
1808ccdfe979SStefano Zampini     m = A->cmap->n;
1809ccdfe979SStefano Zampini     n = B->cmap->n;
1810ccdfe979SStefano Zampini     break;
1811ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1812ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1813ccdfe979SStefano Zampini     mat = cusp->mat;
1814ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1815ccdfe979SStefano Zampini     m   = A->rmap->n;
1816ccdfe979SStefano Zampini     n   = B->rmap->n;
1817ccdfe979SStefano Zampini     break;
1818ccdfe979SStefano Zampini   default:
1819ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1820ccdfe979SStefano Zampini   }
1821ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1822ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1823ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1824ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1825*afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
1826ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1827*afb2bd1cSJunchao Zhang 
1828ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1829c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1830c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1831c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1832c8378d12SStefano Zampini   } else {
1833c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1834c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1835c8378d12SStefano Zampini   }
1836c8378d12SStefano Zampini 
1837c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1838*afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1839*afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1840*afb2bd1cSJunchao Zhang   /* (re)allcoate spmmBuffer if not initialized or LDAs are different */
1841*afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1842*afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1843*afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
1844*afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1845*afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
1846*afb2bd1cSJunchao Zhang     }
1847c8378d12SStefano Zampini 
1848*afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1849*afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1850*afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1851*afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
1852*afb2bd1cSJunchao Zhang     }
1853*afb2bd1cSJunchao Zhang 
1854*afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
1855*afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
1856*afb2bd1cSJunchao Zhang                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1857*afb2bd1cSJunchao Zhang                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1858*afb2bd1cSJunchao Zhang                               csrmat->values->data().get(),
1859*afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1860*afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1861*afb2bd1cSJunchao Zhang     }
1862*afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1863*afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1864*afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
1865*afb2bd1cSJunchao Zhang                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1866*afb2bd1cSJunchao Zhang     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1867*afb2bd1cSJunchao Zhang     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1868*afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
1869*afb2bd1cSJunchao Zhang   } else {
1870*afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
1871*afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1872*afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1873*afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1874*afb2bd1cSJunchao Zhang   }
1875*afb2bd1cSJunchao Zhang 
1876*afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
1877*afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1878*afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1879*afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
1880*afb2bd1cSJunchao Zhang                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1881*afb2bd1cSJunchao Zhang  #else
1882*afb2bd1cSJunchao Zhang   PetscInt k;
1883*afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
1884ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1885ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
1886ccdfe979SStefano Zampini     cublasStatus_t cerr;
1887ccdfe979SStefano Zampini 
1888ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1889ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1890ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
1891ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
1892ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
1893ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1894ccdfe979SStefano Zampini     blda = B->cmap->n;
1895*afb2bd1cSJunchao Zhang     k    = B->cmap->n;
1896*afb2bd1cSJunchao Zhang   } else {
1897*afb2bd1cSJunchao Zhang     k    = B->rmap->n;
1898ccdfe979SStefano Zampini   }
1899ccdfe979SStefano Zampini 
1900*afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1901ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1902*afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
1903ccdfe979SStefano Zampini                            csrmat->values->data().get(),
1904ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
1905ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
1906ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1907ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
1908*afb2bd1cSJunchao Zhang  #endif
1909*afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1910c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1911c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1912ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1913ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
1914ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1915ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1916ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
1917ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1918ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1919ccdfe979SStefano Zampini   } else {
1920ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1921ccdfe979SStefano Zampini   }
1922ccdfe979SStefano Zampini   if (mmdata->cisdense) {
1923ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1924ccdfe979SStefano Zampini   }
1925ccdfe979SStefano Zampini   if (!biscuda) {
1926ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1927ccdfe979SStefano Zampini   }
1928ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1929ccdfe979SStefano Zampini }
1930ccdfe979SStefano Zampini 
1931ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1932ccdfe979SStefano Zampini {
1933ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
1934ccdfe979SStefano Zampini   Mat                A,B;
1935ccdfe979SStefano Zampini   PetscInt           m,n;
1936ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
1937ccdfe979SStefano Zampini   PetscErrorCode     ierr;
1938ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
1939ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
1940ccdfe979SStefano Zampini 
1941ccdfe979SStefano Zampini   PetscFunctionBegin;
1942ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1943ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1944ccdfe979SStefano Zampini   A    = product->A;
1945ccdfe979SStefano Zampini   B    = product->B;
1946ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1947ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1948ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1949ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1950ccdfe979SStefano Zampini   switch (product->type) {
1951ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1952ccdfe979SStefano Zampini     m = A->rmap->n;
1953ccdfe979SStefano Zampini     n = B->cmap->n;
1954ccdfe979SStefano Zampini     break;
1955ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1956ccdfe979SStefano Zampini     m = A->cmap->n;
1957ccdfe979SStefano Zampini     n = B->cmap->n;
1958ccdfe979SStefano Zampini     break;
1959ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1960ccdfe979SStefano Zampini     m = A->rmap->n;
1961ccdfe979SStefano Zampini     n = B->rmap->n;
1962ccdfe979SStefano Zampini     break;
1963ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1964ccdfe979SStefano Zampini     m = B->cmap->n;
1965ccdfe979SStefano Zampini     n = B->cmap->n;
1966ccdfe979SStefano Zampini     break;
1967ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1968ccdfe979SStefano Zampini     m = B->rmap->n;
1969ccdfe979SStefano Zampini     n = B->rmap->n;
1970ccdfe979SStefano Zampini     break;
1971ccdfe979SStefano Zampini   default:
1972ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1973ccdfe979SStefano Zampini   }
1974ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1975ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1976ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1977ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1978ccdfe979SStefano Zampini 
1979ccdfe979SStefano Zampini   /* product data */
1980ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1981ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
1982*afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1983*afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
1984ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1985*afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1986ccdfe979SStefano Zampini   }
1987*afb2bd1cSJunchao Zhang  #endif
1988ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
1989ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1990ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1991ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1992ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1993ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1994ccdfe979SStefano Zampini     } else {
1995ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1996ccdfe979SStefano Zampini     }
1997ccdfe979SStefano Zampini   }
1998ccdfe979SStefano Zampini   C->product->data    = mmdata;
1999ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2000ccdfe979SStefano Zampini 
2001ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2002ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2003ccdfe979SStefano Zampini }
2004ccdfe979SStefano Zampini 
2005ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2006ccdfe979SStefano Zampini 
2007ccdfe979SStefano Zampini /* handles dense B */
2008ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2009ccdfe979SStefano Zampini {
2010ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
2011ccdfe979SStefano Zampini   PetscErrorCode ierr;
2012ccdfe979SStefano Zampini 
2013ccdfe979SStefano Zampini   PetscFunctionBegin;
2014ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2015ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2016ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
2017ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2018ccdfe979SStefano Zampini     PetscFunctionReturn(0);
2019ccdfe979SStefano Zampini   }
2020ccdfe979SStefano Zampini   switch (product->type) {
2021ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2022ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2023ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2024ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2025ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2026ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2027ccdfe979SStefano Zampini   default:
2028ccdfe979SStefano Zampini     break;
2029ccdfe979SStefano Zampini   }
2030ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2031ccdfe979SStefano Zampini }
2032ccdfe979SStefano Zampini 
20336fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
20349ae82921SPaul Mullowney {
2035b175d8bbSPaul Mullowney   PetscErrorCode ierr;
20369ae82921SPaul Mullowney 
20379ae82921SPaul Mullowney   PetscFunctionBegin;
2038e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2039e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2040e6e9a74fSStefano Zampini }
2041e6e9a74fSStefano Zampini 
2042e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2043e6e9a74fSStefano Zampini {
2044e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2045e6e9a74fSStefano Zampini 
2046e6e9a74fSStefano Zampini   PetscFunctionBegin;
2047e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2048e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2049e6e9a74fSStefano Zampini }
2050e6e9a74fSStefano Zampini 
2051e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2052e6e9a74fSStefano Zampini {
2053e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2054e6e9a74fSStefano Zampini 
2055e6e9a74fSStefano Zampini   PetscFunctionBegin;
2056e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2057e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2058e6e9a74fSStefano Zampini }
2059e6e9a74fSStefano Zampini 
2060e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2061e6e9a74fSStefano Zampini {
2062e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2063e6e9a74fSStefano Zampini 
2064e6e9a74fSStefano Zampini   PetscFunctionBegin;
2065e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
20669ae82921SPaul Mullowney   PetscFunctionReturn(0);
20679ae82921SPaul Mullowney }
20689ae82921SPaul Mullowney 
20696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2070ca45077fSPaul Mullowney {
2071b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2072ca45077fSPaul Mullowney 
2073ca45077fSPaul Mullowney   PetscFunctionBegin;
2074e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2075ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2076ca45077fSPaul Mullowney }
2077ca45077fSPaul Mullowney 
/* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

   Shared implementation behind MatMult/MatMultAdd/MatMultTranspose/MatMultTransposeAdd and
   the Hermitian variants. yy may be NULL (no add) and yy may alias zz (in-place add).
   Handles compressed-row storage (zero rows dropped) via a work vector plus a thrust
   scatter/gather, and dispatches to the cuSPARSE generic API (CUDA >= 11) or the legacy
   csr/hyb spmv routines otherwise.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows, so op(A) x = 0 and the result is simply y (or 0 when there is no y).
       Bug fix: zz must receive yy here; previously zz was left untouched when yy was given,
       silently dropping the add-vector contribution. */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else if (zz != yy) {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    /* For the Hermitian op, or when explicit-transpose generation is disabled, use the
       original storage with a (conjugate) transpose operation; otherwise use the
       explicitly generated transpose with a non-transpose operation. */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* Transpose path: A^T x is full length, so beta already folded yy in when zz == yy;
         otherwise add yy explicitly */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
22609ae82921SPaul Mullowney 
/* zz = A^T xx + yy: forwards to the shared kernel with trans=PETSC_TRUE, herm=PETSC_FALSE */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2269ca45077fSPaul Mullowney 
/* Finish assembly on the host, then mirror the matrix onto the GPU when appropriate */
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Host-side assembly first: it finalizes the AIJ data structures on the CPU */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* Copy to the GPU only for a final assembly of an unfactored matrix that is not pinned to the CPU */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
22829ae82921SPaul Mullowney 
22839ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2284e057df02SPaul Mullowney /*@
22859ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2287e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2288e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2289e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2290e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
22919ae82921SPaul Mullowney 
2292d083f849SBarry Smith    Collective
22939ae82921SPaul Mullowney 
22949ae82921SPaul Mullowney    Input Parameters:
22959ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
22969ae82921SPaul Mullowney .  m - number of rows
22979ae82921SPaul Mullowney .  n - number of columns
22989ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
22999ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
23000298fd71SBarry Smith          (possibly different for each row) or NULL
23019ae82921SPaul Mullowney 
23029ae82921SPaul Mullowney    Output Parameter:
23039ae82921SPaul Mullowney .  A - the matrix
23049ae82921SPaul Mullowney 
23059ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
23079ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
23089ae82921SPaul Mullowney 
23099ae82921SPaul Mullowney    Notes:
23109ae82921SPaul Mullowney    If nnz is given then nz is ignored
23119ae82921SPaul Mullowney 
23129ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
23139ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
23149ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
23159ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
23169ae82921SPaul Mullowney 
23179ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
23180298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
23199ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
23209ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
23219ae82921SPaul Mullowney 
23229ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
23239ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
23249ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
23259ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
23269ae82921SPaul Mullowney 
23279ae82921SPaul Mullowney    Level: intermediate
23289ae82921SPaul Mullowney 
2329e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
23309ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  /* Sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Call the SeqAIJ preallocation directly; the cast drops const to match its signature */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
23429ae82921SPaul Mullowney 
/* Release the GPU-side data hanging off A->spptr, clear composed methods, then destroy the host AIJ matrix */
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* A->spptr holds either the multiply structures (unfactored) or the triangular-factor
     structures (factored); free whichever one applies */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  /* Reset the composed query/dispatch functions before falling back to the SeqAIJ destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
23609ae82921SPaul Mullowney 
2361ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
236295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate as a host SeqAIJ matrix, then convert the copy in place back to AIJCUSPARSE */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
23729ff858a8SKarl Rupp 
/* Switch the matrix operation table between the host (SeqAIJ) and device (SeqAIJCUSPARSE)
   implementations. flg=PETSC_TRUE binds to the CPU. No-op for factored matrices. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* Bind to CPU: use the host SeqAIJ kernels; no host Hermitian-transpose variants exist */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* Unbind: restore the GPU kernels and the cuSPARSE-aware MatProduct hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
240895639643SRichard Tran Mills 
/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE.
   reuse == MAT_INITIAL_MATRIX: duplicate A into *newmat first;
   reuse == MAT_REUSE_MATRIX:   copy A's values into the existing *newmat;
   reuse == MAT_INPLACE_MATRIX: convert A itself (caller passes &A as newmat). */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* Vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    /* Allocate the GPU-side companion structure: multiply data for an unfactored matrix,
       triangular-factor data otherwise; each gets its own cuSPARSE handle */
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* Install the GPU mult kernels and MatProduct hooks */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
24549ae82921SPaul Mullowney 
/* Type constructor for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, convert it in place,
   then process the cusparse-specific command line options */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject below is declared by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
246802fe1965SBarry Smith 
24693ca39a21SBarry Smith /*MC
2470e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2471e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
24732692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
24742692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2475e057df02SPaul Mullowney 
2476e057df02SPaul Mullowney    Options Database Keys:
2477e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2478aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2479a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2480e057df02SPaul Mullowney 
2481e057df02SPaul Mullowney   Level: beginner
2482e057df02SPaul Mullowney 
24838468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2484e057df02SPaul Mullowney M*/
24857f756511SDominic Meiser 
248642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
248742c9c57cSBarry Smith 
24880f39cd5aSBarry Smith 
/* Register the cuSPARSE factorization backend for every factor type it supports
   (LU, Cholesky, ILU, ICC) on MATSEQAIJCUSPARSE matrices */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftypes)/sizeof(ftypes[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
250029b38603SBarry Smith 
/* Free all GPU-side data owned by a Mat_SeqAIJCUSPARSE struct and the struct itself.

   Destroys the mat and matTranspose multiply structures, deletes the thrust work
   vector and GPU row-offset copy, tears down the cuSPARSE handle if one was created,
   frees the csr2csc buffer (CUDA >= 11), and finally frees the struct, setting the
   caller's pointer to NULL via PetscFree(). Safe when *cusparsestruct is NULL. */
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* extra parentheses mark the assignment-in-condition as intentional (silences -Wparentheses) */
    if ((handle = (*cusparsestruct)->handle)) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
25217f756511SDominic Meiser 
/* Releases the three device arrays of a CsrMatrix (values, column indices,
   row offsets) and the struct itself, then clears the caller's pointer.
   A NULL *mat is a no-op. */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (*mat) {
    CsrMatrix *csr = *mat;
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
25347f756511SDominic Meiser 
/* Destroys one triangular-factor struct: its matrix descriptor, solve
   analysis info, CSR storage and (CUDA >= 11 only) the scratch buffers used
   by the triangular solve and csr2csc conversion.  *trifactor is reset to 0. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
    if (tf->descr)     {stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);}
    if (tf->solveInfo) {stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);}
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    if (tf->solveBuffer)   {cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);}
    if (tf->csr2cscBuffer) {cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    delete tf;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
25557f756511SDominic Meiser 
/* Destroys a Mat_SeqAIJCUSPARSEMultStruct in the given storage format: the
   stored matrix (CSR, or ELL/HYB before CUDA 11), its matrix descriptor, the
   compressed-row index vector, the device-resident scalar constants and, for
   CUDA >= 11, the generic SpMV descriptors and buffers.  *matstruct is reset
   to 0.  ELL/HYB formats error out under CUDA >= 11, where cuSPARSE removed
   them. */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* previously the error code was silently dropped */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* alpha_one/beta_zero/beta_one are scalars kept in device memory for cusparse calls */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* tear down each initialized SpMV configuration (buffer + dense vector descriptors) */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
25997f756511SDominic Meiser 
/* Resets a Mat_SeqAIJCUSPARSETriFactors container: destroys the four
   triangular factors (lower/upper, plain and transposed) and frees the
   permutation index vectors and the work vector, clearing those pointers.
   The container itself and its cusparse handle are kept alive. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
    /* delete and null together so the struct can be safely reset again */
    delete fs->rpermIndices; fs->rpermIndices = 0;
    delete fs->cpermIndices; fs->cpermIndices = 0;
    delete fs->workVector;   fs->workVector   = 0;
  }
  PetscFunctionReturn(0);
}
2619ccdfe979SStefano Zampini 
/* Fully destroys a Mat_SeqAIJCUSPARSETriFactors container: resets all factor
   data via MatSeqAIJCUSPARSETriFactors_Reset(), destroys the cusparse handle
   (if one was created) and frees the container.  A NULL *trifactors is a
   no-op; PetscFree() zeroes the pointer on return. */
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* extra parentheses: the assignment inside the condition is intentional */
    if ((handle = (*trifactors)->handle)) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2636