xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 3fa6b06a16b9e10b6e41dcaa1d4ab6dd90b29d60)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
749ae82921SPaul Mullowney 
757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
817f756511SDominic Meiser 
/*
   MatCUSPARSESetStream - attaches a CUDA stream to the matrix' cuSPARSE handle,
   so that subsequent cuSPARSE calls issued for A are queued on that stream.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr holds a Mat_SeqAIJCUSPARSE)
-  stream - the CUDA stream to use
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  /* remember the stream and push it into the cuSPARSE library handle */
  cusp->stream = stream;
  status       = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
92b06137fdSPaul Mullowney 
/*
   MatCUSPARSESetHandle - installs a caller-provided cuSPARSE handle on the matrix.
   Any handle the matrix previously owned is destroyed first; the pointer mode of
   the (new) handle is set to device pointers.

   Input Parameters:
+  A      - matrix of type SEQAIJCUSPARSE (A->spptr holds a Mat_SeqAIJCUSPARSE)
-  handle - the cuSPARSE handle to use
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   status;

  PetscFunctionBegin;
  if (cusp->handle != handle) {
    /* a different handle is being installed: release the one currently held */
    if (cusp->handle) {status = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(status);}
    cusp->handle = handle;
  }
  /* scalar arguments (alpha/beta) are expected to live in device memory */
  status = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(status);
  PetscFunctionReturn(0);
}
108b06137fdSPaul Mullowney 
/*
   MatCUSPARSEClearHandle - drops the matrix' reference to its cuSPARSE handle
   without destroying the handle itself (ownership stays with whoever supplied it
   via MatCUSPARSESetHandle()).

   Input Parameter:
.  A - matrix of type SEQAIJCUSPARSE

   Note: unlike the original version this guards against A->spptr being NULL,
   which can happen on destroy/reset paths where the GPU data has already been
   detached; previously that case dereferenced a NULL pointer.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (!cusparsestruct) PetscFunctionReturn(0); /* no GPU data attached: nothing to clear */
  if (cusparsestruct->handle) cusparsestruct->handle = 0;
  PetscFunctionReturn(0);
}
117b06137fdSPaul Mullowney 
/* Query callback composed on factor matrices created by this package:
   reports MATSOLVERCUSPARSE as the solver that produced the factor. */
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1249ae82921SPaul Mullowney 
125c708e6cdSJed Brown /*MC
126087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
127087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
128087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
129087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
130087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
131087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
132c708e6cdSJed Brown 
1339ae82921SPaul Mullowney   Level: beginner
134c708e6cdSJed Brown 
1353ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
136c708e6cdSJed Brown M*/
1379ae82921SPaul Mullowney 
/*
   Factory routine registered for MATSOLVERCUSPARSE: creates an (unassembled)
   square factor matrix *B of type SEQAIJCUSPARSE sized like A and installs the
   symbolic-factorization routines matching the requested factor type.
   LU/ILU/ILUDT and Cholesky/ICC are supported; anything else errors out.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype  = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  /* MAT_SKIP_ALLOCATION: host arrays are set up later by the symbolic factorization */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1639ae82921SPaul Mullowney 
/* Type-specific implementation behind MatCUSPARSESetFormat() for SEQAIJCUSPARSE.
   A sequential matrix keeps a single storage format, so MAT_CUSPARSE_MULT and
   MAT_CUSPARSE_ALL both set the same field; other operations are rejected. */
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: both operations map to the one stored format */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
1819ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID,1);
  /* dispatch to the type-specific implementation; silently a no-op for types
     that do not compose "MatCUSPARSESetFormat_C" (that is PetscTryMethod's contract) */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
208e057df02SPaul Mullowney 
/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode ierr;
  PetscBool      isseqaijcusparse;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&isseqaijcusparse);CHKERRQ(ierr);
  if (isseqaijcusparse) { /* silently ignore other matrix types */
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
    cusp->transgen = transgen;
    if (!transgen) {
      /* drop any cached transpose now, so enabling transgen later starts from a clean state */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
241e6e9a74fSStefano Zampini 
/*
  Processes the -mat_cusparse_* command-line options for a SEQAIJCUSPARSE matrix.
  All options apply only to non-factored matrices (A->factortype == MAT_FACTOR_NONE);
  for factor matrices this routine parses nothing.  With CUDA >= 11 the cuSPARSE
  algorithm choices for SpMV/SpMM/csr2csc are also exposed; defaults are set here
  even when the user gives no option.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  PetscErrorCode           ierr;
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool transgen = cusparsestruct->transgen;

    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}

    /* format for multiplication only ... */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}

    /* ... and the catch-all format option, parsed second so it wins if both are given */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
    /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
    if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
    /* same position-vs-value consistency check as for SpMV above */
    if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
    /* same position-vs-value consistency check; slot 0 of the name table is a placeholder since cuSPARSE starts this enum at 1 */
    if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2859ae82921SPaul Mullowney 
/* Symbolic ILU: delegate entirely to the CPU SeqAIJ implementation, then point
   the numeric phase at the CUSPARSE version. */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2959ae82921SPaul Mullowney 
/* Symbolic LU: delegate entirely to the CPU SeqAIJ implementation, then point
   the numeric phase at the CUSPARSE version. */
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3059ae82921SPaul Mullowney 
/* Symbolic ICC: delegate entirely to the CPU SeqAIJ implementation, then point
   the numeric phase at the CUSPARSE Cholesky version. */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
315087f3262SPaul Mullowney 
/* Symbolic Cholesky: delegate entirely to the CPU SeqAIJ implementation, then
   point the numeric phase at the CUSPARSE version. */
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
325087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds the GPU representation of the
  unit-lower-triangular factor L of an ILU factorization held in A (a factored
  SeqAIJ matrix; a->i/a->j/a->a hold the factor data on the host).

  For each row i>0 the strictly-lower entries are copied and a 1.0 is appended
  as the (unit) diagonal; row 0 of L is just the unit diagonal.  The result is
  stored in CSR form in cusparseTriFactors->loTriFactorPtr together with the
  cuSPARSE solve-analysis data needed by the triangular solves.  Work is done
  only when the data currently lives on the CPU (offloadmask UNALLOCATED/CPU).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;     /* host CSR row offsets / column indices of L */
  PetscScalar                       *AALo;            /* host CSR values of L */
  PetscInt                          i,nz, nzLower, offset, rowOffset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      /* n unit-diagonal entries + strictly-lower entries of rows 1..n-1 */
      nzLower=n+ai[n]-ai[1];

      /* Allocate Space for the lower triangular matrix */
      /* cudaMallocHost gives page-locked host buffers for the device transfers below */
      cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the lower triangular matrix */
      AiLo[0]  = (PetscInt) 0;
      AiLo[n]  = nzLower;
      AjLo[0]  = (PetscInt) 0;     /* row 0 is just the unit diagonal */
      AALo[0]  = (MatScalar) 1.0;
      v        = aa;
      vi       = aj;
      offset   = 1;                /* write position in AjLo/AALo */
      rowOffset= 1;                /* running row-offset (CSR) value */
      for (i=1; i<n; i++) {
        nz = ai[i+1] - ai[i];      /* strictly-lower entries in row i */
        /* additional 1 for the term on the diagonal */
        AiLo[i]    = rowOffset;
        rowOffset += nz+1;

        /* copy the strictly-lower part, then append the unit diagonal entry */
        ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

        offset      += nz;
        AjLo[offset] = (PetscInt) i;
        AALo[offset] = (MatScalar) 1.0;
        offset      += 1;

        v  += nz;
        vi += nz;
      }

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 requires the GENERAL matrix type; fill mode/diag type below still mark it triangular */
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: copy the host CSR arrays into device (thrust) containers */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = n;
      loTriFactor->csrMat->num_cols = n;
      loTriFactor->csrMat->num_entries = nzLower;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
      loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

      loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
      loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 analysis/solve need an explicit user-provided work buffer: query size, then allocate */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* device containers own copies now; release the pinned host staging buffers */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
4469ae82921SPaul Mullowney 
/*
  Extracts the upper triangular factor U from the host-side SeqAIJ ILU factor
  storage (a->a/a->j with a->diag marking diagonals), copies it to the GPU as a
  32-bit-index CSR matrix, and runs the cuSPARSE triangular-solve analysis on it.
  Rebuilds only when the device copy is missing or out of date; no-op for n == 0.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;   /* host staging: CSR row offsets and column indices of U */
  PetscScalar                       *AAUp;          /* host staging: CSR values of U */
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the GPU data is unallocated or the CPU copy is newer */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         adiag[] decreases, so adiag[0]-adiag[n] counts all upper-triangular entries */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix; pinned host memory so the
         thrust assign()s below copy to the device efficiently */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first and
         packing each row (diagonal first, then the off-diagonals) from the back
         of the arrays toward the front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; the value written is the reciprocal
           of the stored diagonal entry */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description; CUDA 11+ requires GENERAL type for the
         csrsv2 interface, older versions use the TRIANGULAR type */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR mirror of the host staging arrays */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information; CUDA 11+ additionally needs an
         explicitly sized work buffer for the analysis/solve */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                   upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                   upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                   upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                   &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                             #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* host staging arrays are no longer needed once the device copies exist */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
5639ae82921SPaul Mullowney 
/*
  Discards any previously built triangular-factor data, rebuilds the lower and
  upper ILU factors on the GPU, allocates the work vector used between the two
  triangular solves, and caches the row/column permutation indices on the
  device when the corresponding index set is not the identity.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *rperm,*cperm;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* wipe stale factor data, then rebuild both triangular factors on the device */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* intermediate vector shared by the lower/upper solve stages */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* cache the row permutation on the device only when it is non-trivial */
  ierr = ISGetIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(rperm,rperm+n);
  }
  ierr = ISRestoreIndices(isrow,&rperm);CHKERRQ(ierr);

  /* likewise for the column permutation */
  ierr = ISGetIndices(iscol,&cperm);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(cperm,cperm+n);
  }

  /* log one length-n host-to-device transfer per non-identity permutation */
  if (!row_identity || !col_identity) {
    PetscInt ncopied = (row_identity ? 0 : n) + (col_identity ? 0 : n);
    ierr = PetscLogCpuToGpu(ncopied*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&cperm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
6119ae82921SPaul Mullowney 
/*
  Builds both triangular solve structures for an ICC (incomplete Cholesky)
  factor on the GPU. A single host pass over the factor's upper-triangular
  storage produces two value arrays (AAUp and AALo) sharing one sparsity
  pattern (AiUp/AjUp); the "upper" solve uses AAUp directly, while the "lower"
  solve reuses the upper-triangular pattern with AALo and is performed as a
  TRANSPOSE operation — hence FILL_MODE_UPPER on the loTriFactor descriptor is
  intentional, not a typo.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;  /* shared CSR pattern for both factors (pinned host staging) */
  PetscScalar                       *AAUp;         /* values for the upper solve */
  PetscScalar                       *AALo;         /* values for the (transposed) lower solve */
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  /* NOTE(review): A->data is viewed through both Mat_SeqAIJ and Mat_SeqSBAIJ here;
     this relies on the i/j/a fields lining up between the two structs — confirm */
  Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only (re)build when the GPU data is unallocated or the CPU copy is newer */
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix; each row is written diagonal-first */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements: both value arrays get the
           reciprocal of the stored diagonal entry v[nz] */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          /* off-diagonals: AAUp holds the negated stored values, AALo holds
             those same values additionally scaled by 1/v[nz] */
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description; unit diagonal is declared, so the solve
         does not use the stored diagonal values of AAUp */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information; CUDA 11+ additionally needs an
         explicitly sized work buffer */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description; the "lower" factor keeps the upper
         storage pattern (FILL_MODE_UPPER) and is solved transposed below */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation: transpose of the upper-stored pattern */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix: same pattern (AiUp/AjUp) as the upper factor, AALo values */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      /* two CSR matrices (pattern + values each) were transferred */
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
    #endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                              #endif
                              );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging arrays */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
791087f3262SPaul Mullowney 
/*
  Resets any previous triangular-factor data, rebuilds the ICC factors on the
  GPU, and — when the factorization permutation is not the identity — caches
  the permutation and its inverse as device-side index arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *perm;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* scratch vector for the two triangular solves; nnz counts both symmetric
     halves of the factor (off-diagonals twice, diagonal once) */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz        = (a->nz-n)*2 + n;

  ierr = ISGetIndices(ip,&perm);CHKERRQ(ierr);
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             invip;
    const PetscInt *iperm;

    /* device copies of the permutation (rows) and its inverse (columns) */
    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&invip);CHKERRQ(ierr);
    ierr = ISGetIndices(invip,&iperm);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(perm,perm+n);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(iperm,iperm+n);
    ierr = ISRestoreIndices(invip,&iperm);CHKERRQ(ierr);
    ierr = ISDestroy(&invip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  ierr = ISRestoreIndices(ip,&perm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
828087f3262SPaul Mullowney 
/*
  Numeric LU factorization for SeqAIJCUSPARSE: performs the factorization on
  the CPU via the SeqAIJ kernel, marks the factor as CPU-resident, installs the
  appropriate MatSolve variants (the NaturalOrdering fast path when both row
  and column permutations are the identity), and copies the triangular factors
  to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rowis = fact->row,colis = fact->col;
  PetscBool      rows_are_identity,cols_are_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant: the NaturalOrdering path avoids permutation work */
  ierr = ISIdentity(rowis,&rows_are_identity);CHKERRQ(ierr);
  ierr = ISIdentity(colis,&cols_are_identity);CHKERRQ(ierr);
  if (rows_are_identity && cols_are_identity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* no specialized MatMatSolve in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
8589ae82921SPaul Mullowney 
/*
  Numeric Cholesky factorization for SeqAIJCUSPARSE: factors on the CPU via the
  SeqAIJ kernel, marks the factor CPU-resident, installs the MatSolve variants
  (NaturalOrdering fast path when the permutation is the identity), and copies
  the triangular factors to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             permis = fact->row;
  PetscBool      perm_is_identity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant based on whether the ordering is trivial */
  ierr = ISIdentity(permis,&perm_is_identity);CHKERRQ(ierr);
  B->ops->solve          = perm_is_identity ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering
                                            : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = perm_is_identity ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering
                                            : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no specialized MatMatSolve in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* move the triangular factors onto the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
8879ae82921SPaul Mullowney 
888b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
889bda325fcSPaul Mullowney {
890bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
891aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
892aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
893aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
894aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
895bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
896aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
897aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
898aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
899aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
900b175d8bbSPaul Mullowney 
901bda325fcSPaul Mullowney   PetscFunctionBegin;
902bda325fcSPaul Mullowney 
903aa372e3fSPaul Mullowney   /*********************************************/
904aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
905aa372e3fSPaul Mullowney   /*********************************************/
906aa372e3fSPaul Mullowney 
907aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
908aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
909aa372e3fSPaul Mullowney 
910aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
911aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
912aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
913aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
914aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
915aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
916aa372e3fSPaul Mullowney 
917aa372e3fSPaul Mullowney   /* Create the matrix description */
91857d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
91957d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
92057d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
92157d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
92257d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
923aa372e3fSPaul Mullowney 
924aa372e3fSPaul Mullowney   /* set the operation */
925aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
926aa372e3fSPaul Mullowney 
927aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
928aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
929afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
930afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
931aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
932afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
933afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
934afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
935aa372e3fSPaul Mullowney 
936aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
937afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
938afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
939afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
940afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
941afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
942afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
943afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
944afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
945afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
946afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
947afb2bd1cSJunchao Zhang   cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
948afb2bd1cSJunchao Zhang #endif
949afb2bd1cSJunchao Zhang 
950aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
951aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
952aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
953aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
954aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
955aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
956afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
957afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
958afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
959afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
960afb2bd1cSJunchao Zhang                         #else
961afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
962afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
963afb2bd1cSJunchao Zhang                         #endif
964afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
965aa372e3fSPaul Mullowney 
966afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
967afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
968afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
969afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
970afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
971afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
972afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
973afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
974afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
975afb2bd1cSJunchao Zhang #endif
976afb2bd1cSJunchao Zhang 
977afb2bd1cSJunchao Zhang   /* perform the solve analysis */
978aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
979afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
980afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
981afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
982afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
983afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
984afb2bd1cSJunchao Zhang                           #endif
985afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
986aa372e3fSPaul Mullowney 
987aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
988aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
989aa372e3fSPaul Mullowney 
990aa372e3fSPaul Mullowney   /*********************************************/
991aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
992aa372e3fSPaul Mullowney   /*********************************************/
993aa372e3fSPaul Mullowney 
994aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
995aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
996aa372e3fSPaul Mullowney 
997aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
998aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
999aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1000aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1001aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1002aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1003aa372e3fSPaul Mullowney 
1004aa372e3fSPaul Mullowney   /* Create the matrix description */
100557d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
100657d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
100757d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
100857d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
100957d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1010aa372e3fSPaul Mullowney 
1011aa372e3fSPaul Mullowney   /* set the operation */
1012aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1013aa372e3fSPaul Mullowney 
1014aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1015aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1016afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1017afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1018aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1019afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1020afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1021afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1022aa372e3fSPaul Mullowney 
1023aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1024afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1025afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1026afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1027afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1028afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1029afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1030afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1031afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1032afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1033afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1034afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1035afb2bd1cSJunchao Zhang #endif
1036afb2bd1cSJunchao Zhang 
1037aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1038aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1039aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1040aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1041aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1042aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1043afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1044afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1045afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1046afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1047afb2bd1cSJunchao Zhang                         #else
1048afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1049afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1050afb2bd1cSJunchao Zhang                         #endif
1051afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1052aa372e3fSPaul Mullowney 
1053afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1054afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1055afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1056afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1057afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1058afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1059afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1060afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1061afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1062afb2bd1cSJunchao Zhang   #endif
1063afb2bd1cSJunchao Zhang 
1064afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1065aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1066afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1067afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1068afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1069afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1070afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1071afb2bd1cSJunchao Zhang                           #endif
1072afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
1073aa372e3fSPaul Mullowney 
1074aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
1075aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1076bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1077bda325fcSPaul Mullowney }
1078bda325fcSPaul Mullowney 
1079b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1080bda325fcSPaul Mullowney {
1081aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1082aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1083aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1084bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1085bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1086aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1087b06137fdSPaul Mullowney   cudaError_t                  err;
108885ba7357SStefano Zampini   PetscErrorCode               ierr;
1089b175d8bbSPaul Mullowney 
1090bda325fcSPaul Mullowney   PetscFunctionBegin;
109185ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
109285ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
109385ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
109485ba7357SStefano Zampini   /* create cusparse matrix */
1095aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
109657d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1097aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
109857d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
109957d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1100aa372e3fSPaul Mullowney 
1101b06137fdSPaul Mullowney   /* set alpha and beta */
1102afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
11037656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
11047656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1105afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
11067656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
11077656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
110857d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1109b06137fdSPaul Mullowney 
1110aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1111aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1112aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1113554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1114554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1115aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1116a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1117aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1118aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1119a3fdcf43SKarl Rupp 
112081902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
112181902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1122afb2bd1cSJunchao Zhang 
112381902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1124afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1125afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1126afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1127afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1128afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1129afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1130afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1131afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1132afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1133afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1134afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1135afb2bd1cSJunchao Zhang    #endif
1136afb2bd1cSJunchao Zhang 
1137a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1138a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1139aa372e3fSPaul Mullowney                             matrix->values->data().get(),
114081902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1141aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1142aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1143afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1145afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1146afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1147afb2bd1cSJunchao Zhang                           #else
1148afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1149afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1150afb2bd1cSJunchao Zhang                           #endif
1151afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
1152aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1153afb2bd1cSJunchao Zhang 
1154afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1155afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1156afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1157afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1158afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1159afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1160afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1161afb2bd1cSJunchao Zhang    #endif
1162aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1163afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1165afb2bd1cSJunchao Zhang    #else
1166aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
116751c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
116851c6d536SStefano Zampini     /* First convert HYB to CSR */
1169aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1170aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1171aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1172aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1173aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1174aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1175aa372e3fSPaul Mullowney 
1176aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1177aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1178aa372e3fSPaul Mullowney                             temp->values->data().get(),
1179aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
118057d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1181aa372e3fSPaul Mullowney 
1182aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1183aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1184aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1185aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1186aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1187aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1188aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1189aa372e3fSPaul Mullowney 
1190aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1191aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1192aa372e3fSPaul Mullowney                             temp->values->data().get(),
1193aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1194aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1195aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1196aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1197aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
119857d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1199aa372e3fSPaul Mullowney 
1200aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1201aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
120257d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1203aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1204aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1205aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1206aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1207aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1208aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
120957d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1210aa372e3fSPaul Mullowney 
1211aa372e3fSPaul Mullowney     /* assign the pointer */
1212aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1213aa372e3fSPaul Mullowney     /* delete temporaries */
1214aa372e3fSPaul Mullowney     if (tempT) {
1215aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1216aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1217aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1218aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1219087f3262SPaul Mullowney     }
1220aa372e3fSPaul Mullowney     if (temp) {
1221aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1222aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1223aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1224aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1225aa372e3fSPaul Mullowney     }
1226afb2bd1cSJunchao Zhang    #endif
1227aa372e3fSPaul Mullowney   }
122805035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
122985ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
123085ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1231213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1232213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1233aa372e3fSPaul Mullowney   /* assign the pointer */
1234aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1235bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1236bda325fcSPaul Mullowney }
1237bda325fcSPaul Mullowney 
12384e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/* Solve A^T x = b on the GPU using the cached transposed triangular factors of an
   LU factorization with row/column permutations.
   Dataflow: b --(row perm)--> x --(solve U^T)--> temp --(solve L^T)--> x
               --(col perm)--> temp --(copy)--> x
   Note the factor order is U then L, the reverse of the untransposed solve, because
   (LU)^T = U^T L^T. */
12396fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1240bda325fcSPaul Mullowney {
1241c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1242465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1243465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1244465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1245465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1246bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1247bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1248aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1249aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1250aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1251b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
125257d48284SJunchao Zhang   cudaError_t                           cerr;
1253bda325fcSPaul Mullowney 
1254bda325fcSPaul Mullowney   PetscFunctionBegin;
1255aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* Lazily build the transposed factor structures the first time a transpose solve is requested */
1256aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1257bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1258aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1259aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1260bda325fcSPaul Mullowney   }
1261bda325fcSPaul Mullowney 
1262bda325fcSPaul Mullowney   /* Get the GPU pointers */
  /* xx is write-only (its prior contents are discarded); bb is read-only */
1263c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1264c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1265c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1266c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1267bda325fcSPaul Mullowney 
12687a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1269aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1270c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1271c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1272c41cb2e2SAlejandro Lamas Daviña                xGPU);
1273aa372e3fSPaul Mullowney 
1274aa372e3fSPaul Mullowney   /* First, solve U */
  /* Input xarray, output tempGPU.  The CUDA >= 11 csrsv2 interface additionally takes
     the nnz count, a solve policy, and a preallocated external buffer. */
1275aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1276afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
1277afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1278afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1279afb2bd1cSJunchao Zhang                       #endif
1280afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1281aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1282aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1283aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1284aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1285afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
1286afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1287afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1288afb2bd1cSJunchao Zhang                       #endif
1289afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1290aa372e3fSPaul Mullowney 
1291aa372e3fSPaul Mullowney   /* Then, solve L */
  /* Input tempGPU, output xarray */
1292aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1293afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
1294afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1295afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1296afb2bd1cSJunchao Zhang                       #endif
1297afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1298aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1299aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1300aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1301aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1302afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1303afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1304afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1305afb2bd1cSJunchao Zhang                       #endif
1306afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1307aa372e3fSPaul Mullowney 
1308aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1309c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1310c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1311aa372e3fSPaul Mullowney                tempGPU->begin());
1312aa372e3fSPaul Mullowney 
1313aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1314c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1315bda325fcSPaul Mullowney 
1316bda325fcSPaul Mullowney   /* restore */
1317c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1318c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
131905035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1320661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the pair of triangular solves costs 2*nnz - n flops */
1321958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1322bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1323bda325fcSPaul Mullowney }
1324bda325fcSPaul Mullowney 
/* Solve A^T x = b on the GPU for a factorization in natural ordering (no row/column
   permutations, so b and x are used directly without the permutation copies of
   MatSolveTranspose_SeqAIJCUSPARSE).
   Dataflow: b --(solve U^T)--> temp --(solve L^T)--> x
   The factor order is U then L, the reverse of the untransposed solve, because
   (LU)^T = U^T L^T. */
13256fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1326bda325fcSPaul Mullowney {
1327465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1328465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1329bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1330bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1331aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1332aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1333aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1334b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
133557d48284SJunchao Zhang   cudaError_t                       cerr;
1336bda325fcSPaul Mullowney 
1337bda325fcSPaul Mullowney   PetscFunctionBegin;
1338aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* Lazily build the transposed factor structures the first time a transpose solve is requested */
1339aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1340bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1341aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1342aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1343bda325fcSPaul Mullowney   }
1344bda325fcSPaul Mullowney 
1345bda325fcSPaul Mullowney   /* Get the GPU pointers */
  /* xx is write-only (its prior contents are discarded); bb is read-only */
1346c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1347c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1348bda325fcSPaul Mullowney 
13497a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1350aa372e3fSPaul Mullowney   /* First, solve U */
  /* Input barray, output tempGPU.  The CUDA >= 11 csrsv2 interface additionally takes
     the nnz count, a solve policy, and a preallocated external buffer. */
1351aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1352afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
1353afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1354afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1355afb2bd1cSJunchao Zhang                       #endif
1356afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1357aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1358aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1359aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1360aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1361afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
1362afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1363afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1364afb2bd1cSJunchao Zhang                       #endif
1365afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1366aa372e3fSPaul Mullowney 
1367aa372e3fSPaul Mullowney   /* Then, solve L */
  /* Input tempGPU, output xarray */
1368aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1369afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
1370afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1371afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1372afb2bd1cSJunchao Zhang                       #endif
1373afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1374aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1375aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1376aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1377aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1378afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1379afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1380afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1381afb2bd1cSJunchao Zhang                       #endif
1382afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1383bda325fcSPaul Mullowney 
1384bda325fcSPaul Mullowney   /* restore */
1385c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1386c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
138705035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1388661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the pair of triangular solves costs 2*nnz - n flops */
1389958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1390bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1391bda325fcSPaul Mullowney }
1392bda325fcSPaul Mullowney 
/* Solve A x = b on the GPU using the cached lower/upper triangular factors of an
   LU factorization with row/column permutations.
   Dataflow: b --(row perm)--> temp --(solve L)--> x --(solve U)--> temp
               --(col perm)--> x */
13936fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
13949ae82921SPaul Mullowney {
1395465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1396465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1397465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1398465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
13999ae82921SPaul Mullowney   cusparseStatus_t                      stat;
14009ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1401aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1402aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1403aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1404b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
140557d48284SJunchao Zhang   cudaError_t                           cerr;
14069ae82921SPaul Mullowney 
14079ae82921SPaul Mullowney   PetscFunctionBegin;
1408ebc8f436SDominic Meiser 
1409e057df02SPaul Mullowney   /* Get the GPU pointers */
  /* xx is write-only (its prior contents are discarded); bb is read-only */
1410c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1411c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1412c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1413c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
14149ae82921SPaul Mullowney 
14157a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1416aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
  /* The copy length is set by the rpermIndices begin/end pair; the permuted rhs lands in tempGPU */
1417c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1418c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
14194e4bbfaaSStefano Zampini                tempGPU->begin());
1420aa372e3fSPaul Mullowney 
1421aa372e3fSPaul Mullowney   /* Next, solve L */
  /* Input tempGPU, output xarray.  The CUDA >= 11 csrsv2 interface additionally takes
     the nnz count, a solve policy, and a preallocated external buffer. */
1422aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1423afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
1424afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1425afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1426afb2bd1cSJunchao Zhang                       #endif
1427afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1428aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1429aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1430aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1431aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1432afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1433afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1434afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1435afb2bd1cSJunchao Zhang                       #endif
1436afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1437aa372e3fSPaul Mullowney 
1438aa372e3fSPaul Mullowney   /* Then, solve U */
  /* Input xarray, output tempGPU */
1439aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1440afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
1441afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1442afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1443afb2bd1cSJunchao Zhang                       #endif
1444afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1445aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1446aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1447aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1448aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1449afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
1450afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1451afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1452afb2bd1cSJunchao Zhang                       #endif
1453afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1454aa372e3fSPaul Mullowney 
14554e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
14564e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
14574e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
14584e4bbfaaSStefano Zampini                xGPU);
14599ae82921SPaul Mullowney 
1460c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1461c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
146205035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1463661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the pair of triangular solves costs 2*nnz - n flops */
1464958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
14659ae82921SPaul Mullowney   PetscFunctionReturn(0);
14669ae82921SPaul Mullowney }
14679ae82921SPaul Mullowney 
/* Solve A x = b on the GPU for a factorization in natural ordering (no row/column
   permutations, so b and x are used directly without the permutation copies of
   MatSolve_SeqAIJCUSPARSE).
   Dataflow: b --(solve L)--> temp --(solve U)--> x */
14686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
14699ae82921SPaul Mullowney {
1470465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1471465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
14729ae82921SPaul Mullowney   cusparseStatus_t                  stat;
14739ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1474aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1475aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1476aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1477b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
147857d48284SJunchao Zhang   cudaError_t                       cerr;
14799ae82921SPaul Mullowney 
14809ae82921SPaul Mullowney   PetscFunctionBegin;
1481e057df02SPaul Mullowney   /* Get the GPU pointers */
  /* xx is write-only (its prior contents are discarded); bb is read-only */
1482c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1483c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
14849ae82921SPaul Mullowney 
14857a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1486aa372e3fSPaul Mullowney   /* First, solve L */
  /* Input barray, output tempGPU.  The CUDA >= 11 csrsv2 interface additionally takes
     the nnz count, a solve policy, and a preallocated external buffer. */
1487aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1488afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
1489afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1490afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1491afb2bd1cSJunchao Zhang                       #endif
1492afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1493aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1494aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1495aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1496aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1497afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
1498afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1499afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1500afb2bd1cSJunchao Zhang                       #endif
1501afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1502aa372e3fSPaul Mullowney 
1503aa372e3fSPaul Mullowney   /* Next, solve U */
  /* Input tempGPU, output xarray */
1504aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1505afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
1506afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1507afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1508afb2bd1cSJunchao Zhang                       #endif
1509afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1510aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1511aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1512aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1513aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1514afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1515afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1516afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1517afb2bd1cSJunchao Zhang                       #endif
1518afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
15199ae82921SPaul Mullowney 
1520c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1521c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
152205035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1523661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the pair of triangular solves costs 2*nnz - n flops */
1524958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
15259ae82921SPaul Mullowney   PetscFunctionReturn(0);
15269ae82921SPaul Mullowney }
15279ae82921SPaul Mullowney 
15286fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
15299ae82921SPaul Mullowney {
1530aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
15317c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
15329ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1533213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
15349ae82921SPaul Mullowney   PetscErrorCode               ierr;
1535aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1536b06137fdSPaul Mullowney   cudaError_t                  err;
15379ae82921SPaul Mullowney 
15389ae82921SPaul Mullowney   PetscFunctionBegin;
153995639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1540c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
154181902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
154281902715SJunchao Zhang       /* Copy values only */
1543afb2bd1cSJunchao Zhang       CsrMatrix *matrix,*matrixT;
1544afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
154585ba7357SStefano Zampini 
154685ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1547afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
154805035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
15494863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
155085ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
155181902715SJunchao Zhang 
155281902715SJunchao Zhang       /* Update matT when it was built before */
155381902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
155481902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1555afb2bd1cSJunchao Zhang         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
155685ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
155781902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1558afb2bd1cSJunchao Zhang                             A->cmap->n, matrix->num_entries,
1559afb2bd1cSJunchao Zhang                             matrix->values->data().get(),
156081902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1561afb2bd1cSJunchao Zhang                             matrix->column_indices->data().get(),
1562afb2bd1cSJunchao Zhang                             matrixT->values->data().get(),
1563afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1564afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1565afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1566afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1567afb2bd1cSJunchao Zhang                           #else
1568afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1569afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1570afb2bd1cSJunchao Zhang                           #endif
1571afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
157205035670SJunchao Zhang         err  = WaitForCUDA();CHKERRCUDA(err);
157385ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
157481902715SJunchao Zhang       }
157534d6c7a5SJose E. Roman     } else {
157685ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
15777c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
15787c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
15797c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
158081902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
15819ae82921SPaul Mullowney       try {
15829ae82921SPaul Mullowney         if (a->compressedrow.use) {
15839ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
15849ae82921SPaul Mullowney           ii   = a->compressedrow.i;
15859ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
15869ae82921SPaul Mullowney         } else {
1587213423ffSJunchao Zhang           m    = A->rmap->n;
1588213423ffSJunchao Zhang           ii   = a->i;
1589e6e9a74fSStefano Zampini           ridx = NULL;
15909ae82921SPaul Mullowney         }
1591213423ffSJunchao Zhang         cusparsestruct->nrows = m;
15929ae82921SPaul Mullowney 
159385ba7357SStefano Zampini         /* create cusparse matrix */
1594aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
159557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
159657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
159757d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
15989ae82921SPaul Mullowney 
1599afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),    sizeof(PetscScalar));CHKERRCUDA(err);
16007656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
16017656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1602afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
16037656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
16047656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
160557d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1606b06137fdSPaul Mullowney 
1607aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1608aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1609aa372e3fSPaul Mullowney           /* set the matrix */
1610afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1611afb2bd1cSJunchao Zhang           mat->num_rows = m;
1612afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1613afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1614afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1615afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
16169ae82921SPaul Mullowney 
1617afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1618afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1619aa372e3fSPaul Mullowney 
1620afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1621afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1622aa372e3fSPaul Mullowney 
1623aa372e3fSPaul Mullowney           /* assign the pointer */
1624afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1625afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1626afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1627afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1628afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1629afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1630afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1631afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1632afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1633afb2bd1cSJunchao Zhang           }
1634afb2bd1cSJunchao Zhang          #endif
1635aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1636afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1637afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1638afb2bd1cSJunchao Zhang          #else
1639afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1640afb2bd1cSJunchao Zhang           mat->num_rows = m;
1641afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1642afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1643afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1644afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1645aa372e3fSPaul Mullowney 
1646afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1647afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1648aa372e3fSPaul Mullowney 
1649afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1650afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1651aa372e3fSPaul Mullowney 
1652aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
165357d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1654aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1655aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1656afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1657afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1658afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1659afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
166057d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1661aa372e3fSPaul Mullowney           /* assign the pointer */
1662aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1663aa372e3fSPaul Mullowney 
1664afb2bd1cSJunchao Zhang           if (mat) {
1665afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1666afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1667afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1668afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1669087f3262SPaul Mullowney           }
1670afb2bd1cSJunchao Zhang          #endif
1671087f3262SPaul Mullowney         }
1672ca45077fSPaul Mullowney 
1673aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1674213423ffSJunchao Zhang         if (a->compressedrow.use) {
1675213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1676aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1677aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1678213423ffSJunchao Zhang           tmp = m;
1679213423ffSJunchao Zhang         } else {
1680213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1681213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1682213423ffSJunchao Zhang           tmp = 0;
1683213423ffSJunchao Zhang         }
1684213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1685aa372e3fSPaul Mullowney 
1686aa372e3fSPaul Mullowney         /* assign the pointer */
1687aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
16889ae82921SPaul Mullowney       } catch(char *ex) {
16899ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
16909ae82921SPaul Mullowney       }
169105035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
169285ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
169334d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
169434d6c7a5SJose E. Roman     }
1695c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
16969ae82921SPaul Mullowney   }
16979ae82921SPaul Mullowney   PetscFunctionReturn(0);
16989ae82921SPaul Mullowney }
16999ae82921SPaul Mullowney 
1700c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1701aa372e3fSPaul Mullowney {
1702aa372e3fSPaul Mullowney   template <typename Tuple>
1703aa372e3fSPaul Mullowney   __host__ __device__
1704aa372e3fSPaul Mullowney   void operator()(Tuple t)
1705aa372e3fSPaul Mullowney   {
1706aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1707aa372e3fSPaul Mullowney   }
1708aa372e3fSPaul Mullowney };
1709aa372e3fSPaul Mullowney 
1710e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1711e6e9a74fSStefano Zampini {
1712e6e9a74fSStefano Zampini   template <typename Tuple>
1713e6e9a74fSStefano Zampini   __host__ __device__
1714e6e9a74fSStefano Zampini   void operator()(Tuple t)
1715e6e9a74fSStefano Zampini   {
1716e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1717e6e9a74fSStefano Zampini   }
1718e6e9a74fSStefano Zampini };
1719e6e9a74fSStefano Zampini 
1720afb2bd1cSJunchao Zhang struct MatMatCusparse {
1721ccdfe979SStefano Zampini   PetscBool            cisdense;
1722ccdfe979SStefano Zampini   PetscScalar          *Bt;
1723ccdfe979SStefano Zampini   Mat                  X;
1724afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1725afb2bd1cSJunchao Zhang   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1726afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
1727afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
1728afb2bd1cSJunchao Zhang   size_t               spmmBufferSize;
1729afb2bd1cSJunchao Zhang   void                 *spmmBuffer;
1730afb2bd1cSJunchao Zhang   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1731afb2bd1cSJunchao Zhang #endif
1732afb2bd1cSJunchao Zhang };
1733ccdfe979SStefano Zampini 
1734ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1735ccdfe979SStefano Zampini {
1736ccdfe979SStefano Zampini   PetscErrorCode ierr;
1737ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1738ccdfe979SStefano Zampini   cudaError_t    cerr;
1739ccdfe979SStefano Zampini 
1740ccdfe979SStefano Zampini   PetscFunctionBegin;
1741ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1742afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1743afb2bd1cSJunchao Zhang   cusparseStatus_t stat;
1744afb2bd1cSJunchao Zhang   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1745afb2bd1cSJunchao Zhang   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1746afb2bd1cSJunchao Zhang   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1747afb2bd1cSJunchao Zhang  #endif
1748ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1749ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1750ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1751ccdfe979SStefano Zampini }
1752ccdfe979SStefano Zampini 
1753ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1754ccdfe979SStefano Zampini 
1755ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1756ccdfe979SStefano Zampini {
1757ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1758ccdfe979SStefano Zampini   Mat                          A,B;
1759afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1760ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1761ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1762ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1763ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1764ccdfe979SStefano Zampini   const PetscScalar            *barray;
1765ccdfe979SStefano Zampini   PetscScalar                  *carray;
1766ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1767ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1768ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1769ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1770afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1771ccdfe979SStefano Zampini 
1772ccdfe979SStefano Zampini   PetscFunctionBegin;
1773ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1774ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1775ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1776ccdfe979SStefano Zampini   A    = product->A;
1777ccdfe979SStefano Zampini   B    = product->B;
1778ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1779ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1780ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1781ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1782ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1783ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1784ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1785ccdfe979SStefano Zampini   switch (product->type) {
1786ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1787ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1788ccdfe979SStefano Zampini     mat = cusp->mat;
1789ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1790ccdfe979SStefano Zampini     m   = A->rmap->n;
1791ccdfe979SStefano Zampini     n   = B->cmap->n;
1792ccdfe979SStefano Zampini     break;
1793ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1794e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1795e6e9a74fSStefano Zampini       mat = cusp->mat;
1796e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1797e6e9a74fSStefano Zampini     } else {
1798ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1799ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1800ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1801e6e9a74fSStefano Zampini     }
1802ccdfe979SStefano Zampini     m = A->cmap->n;
1803ccdfe979SStefano Zampini     n = B->cmap->n;
1804ccdfe979SStefano Zampini     break;
1805ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1806ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1807ccdfe979SStefano Zampini     mat = cusp->mat;
1808ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1809ccdfe979SStefano Zampini     m   = A->rmap->n;
1810ccdfe979SStefano Zampini     n   = B->rmap->n;
1811ccdfe979SStefano Zampini     break;
1812ccdfe979SStefano Zampini   default:
1813ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1814ccdfe979SStefano Zampini   }
1815ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1816ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1817ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1818ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1819afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
1820ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1821afb2bd1cSJunchao Zhang 
1822ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1823c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1824c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1825c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1826c8378d12SStefano Zampini   } else {
1827c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1828c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1829c8378d12SStefano Zampini   }
1830c8378d12SStefano Zampini 
1831c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1832afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1833afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1834afb2bd1cSJunchao Zhang   /* (re)allcoate spmmBuffer if not initialized or LDAs are different */
1835afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1836afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1837afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
1838afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1839afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
1840afb2bd1cSJunchao Zhang     }
1841c8378d12SStefano Zampini 
1842afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1843afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1844afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1845afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
1846afb2bd1cSJunchao Zhang     }
1847afb2bd1cSJunchao Zhang 
1848afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
1849afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
1850afb2bd1cSJunchao Zhang                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1851afb2bd1cSJunchao Zhang                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1852afb2bd1cSJunchao Zhang                               csrmat->values->data().get(),
1853afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1854afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1855afb2bd1cSJunchao Zhang     }
1856afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1857afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1858afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
1859afb2bd1cSJunchao Zhang                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1860afb2bd1cSJunchao Zhang     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1861afb2bd1cSJunchao Zhang     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1862afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
1863afb2bd1cSJunchao Zhang   } else {
1864afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
1865afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1866afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1867afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1868afb2bd1cSJunchao Zhang   }
1869afb2bd1cSJunchao Zhang 
1870afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
1871afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1872afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1873afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
1874afb2bd1cSJunchao Zhang                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1875afb2bd1cSJunchao Zhang  #else
1876afb2bd1cSJunchao Zhang   PetscInt k;
1877afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
1878ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1879ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
1880ccdfe979SStefano Zampini     cublasStatus_t cerr;
1881ccdfe979SStefano Zampini 
1882ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1883ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1884ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
1885ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
1886ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
1887ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1888ccdfe979SStefano Zampini     blda = B->cmap->n;
1889afb2bd1cSJunchao Zhang     k    = B->cmap->n;
1890afb2bd1cSJunchao Zhang   } else {
1891afb2bd1cSJunchao Zhang     k    = B->rmap->n;
1892ccdfe979SStefano Zampini   }
1893ccdfe979SStefano Zampini 
1894afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1895ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1896afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
1897ccdfe979SStefano Zampini                            csrmat->values->data().get(),
1898ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
1899ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
1900ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1901ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
1902afb2bd1cSJunchao Zhang  #endif
1903afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1904c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1905c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1906ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1907ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
1908ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1909ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1910ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
1911ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1912ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1913ccdfe979SStefano Zampini   } else {
1914ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1915ccdfe979SStefano Zampini   }
1916ccdfe979SStefano Zampini   if (mmdata->cisdense) {
1917ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1918ccdfe979SStefano Zampini   }
1919ccdfe979SStefano Zampini   if (!biscuda) {
1920ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1921ccdfe979SStefano Zampini   }
1922ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1923ccdfe979SStefano Zampini }
1924ccdfe979SStefano Zampini 
1925ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1926ccdfe979SStefano Zampini {
1927ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
1928ccdfe979SStefano Zampini   Mat                A,B;
1929ccdfe979SStefano Zampini   PetscInt           m,n;
1930ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
1931ccdfe979SStefano Zampini   PetscErrorCode     ierr;
1932ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
1933ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
1934ccdfe979SStefano Zampini 
1935ccdfe979SStefano Zampini   PetscFunctionBegin;
1936ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1937ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1938ccdfe979SStefano Zampini   A    = product->A;
1939ccdfe979SStefano Zampini   B    = product->B;
1940ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1941ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1942ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1943ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1944ccdfe979SStefano Zampini   switch (product->type) {
1945ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1946ccdfe979SStefano Zampini     m = A->rmap->n;
1947ccdfe979SStefano Zampini     n = B->cmap->n;
1948ccdfe979SStefano Zampini     break;
1949ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1950ccdfe979SStefano Zampini     m = A->cmap->n;
1951ccdfe979SStefano Zampini     n = B->cmap->n;
1952ccdfe979SStefano Zampini     break;
1953ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1954ccdfe979SStefano Zampini     m = A->rmap->n;
1955ccdfe979SStefano Zampini     n = B->rmap->n;
1956ccdfe979SStefano Zampini     break;
1957ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1958ccdfe979SStefano Zampini     m = B->cmap->n;
1959ccdfe979SStefano Zampini     n = B->cmap->n;
1960ccdfe979SStefano Zampini     break;
1961ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1962ccdfe979SStefano Zampini     m = B->rmap->n;
1963ccdfe979SStefano Zampini     n = B->rmap->n;
1964ccdfe979SStefano Zampini     break;
1965ccdfe979SStefano Zampini   default:
1966ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1967ccdfe979SStefano Zampini   }
1968ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
1969ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
1970ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
1971ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
1972ccdfe979SStefano Zampini 
1973ccdfe979SStefano Zampini   /* product data */
1974ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
1975ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
1976afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
1977afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
1978ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1979afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
1980ccdfe979SStefano Zampini   }
1981afb2bd1cSJunchao Zhang  #endif
1982ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
1983ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1984ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
1985ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
1986ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
1987ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1988ccdfe979SStefano Zampini     } else {
1989ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
1990ccdfe979SStefano Zampini     }
1991ccdfe979SStefano Zampini   }
1992ccdfe979SStefano Zampini   C->product->data    = mmdata;
1993ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
1994ccdfe979SStefano Zampini 
1995ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
1996ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1997ccdfe979SStefano Zampini }
1998ccdfe979SStefano Zampini 
1999ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2000ccdfe979SStefano Zampini 
2001ccdfe979SStefano Zampini /* handles dense B */
2002ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2003ccdfe979SStefano Zampini {
2004ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
2005ccdfe979SStefano Zampini   PetscErrorCode ierr;
2006ccdfe979SStefano Zampini 
2007ccdfe979SStefano Zampini   PetscFunctionBegin;
2008ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2009ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2010ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
2011ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2012ccdfe979SStefano Zampini     PetscFunctionReturn(0);
2013ccdfe979SStefano Zampini   }
2014ccdfe979SStefano Zampini   switch (product->type) {
2015ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2016ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2017ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2018ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2019ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2020ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2021ccdfe979SStefano Zampini   default:
2022ccdfe979SStefano Zampini     break;
2023ccdfe979SStefano Zampini   }
2024ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2025ccdfe979SStefano Zampini }
2026ccdfe979SStefano Zampini 
20276fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
20289ae82921SPaul Mullowney {
2029b175d8bbSPaul Mullowney   PetscErrorCode ierr;
20309ae82921SPaul Mullowney 
20319ae82921SPaul Mullowney   PetscFunctionBegin;
2032e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2033e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2034e6e9a74fSStefano Zampini }
2035e6e9a74fSStefano Zampini 
2036e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2037e6e9a74fSStefano Zampini {
2038e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2039e6e9a74fSStefano Zampini 
2040e6e9a74fSStefano Zampini   PetscFunctionBegin;
2041e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2042e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2043e6e9a74fSStefano Zampini }
2044e6e9a74fSStefano Zampini 
2045e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2046e6e9a74fSStefano Zampini {
2047e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2048e6e9a74fSStefano Zampini 
2049e6e9a74fSStefano Zampini   PetscFunctionBegin;
2050e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2051e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2052e6e9a74fSStefano Zampini }
2053e6e9a74fSStefano Zampini 
2054e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2055e6e9a74fSStefano Zampini {
2056e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2057e6e9a74fSStefano Zampini 
2058e6e9a74fSStefano Zampini   PetscFunctionBegin;
2059e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
20609ae82921SPaul Mullowney   PetscFunctionReturn(0);
20619ae82921SPaul Mullowney }
20629ae82921SPaul Mullowney 
20636fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2064ca45077fSPaul Mullowney {
2065b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2066ca45077fSPaul Mullowney 
2067ca45077fSPaul Mullowney   PetscFunctionBegin;
2068e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2069ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2070ca45077fSPaul Mullowney }
2071ca45077fSPaul Mullowney 
2072afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
/*
   Shared kernel behind MatMult/MatMultAdd/MatMultTranspose(Add)/MatMultHermitianTranspose(Add)
   for SEQAIJCUSPARSE matrices.

   Computes zz = op(A)*xx + yy on the GPU, where
     op(A) = A    when trans == PETSC_FALSE,
     op(A) = A^T  when trans == PETSC_TRUE and herm == PETSC_FALSE,
     op(A) = A^H  when trans == PETSC_TRUE and herm == PETSC_TRUE.
   yy may be NULL (plain multiply) and may alias zz (classic MatMultAdd).
   herm without trans is rejected with PETSC_ERR_PLIB.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;      /* does matstruct drop all-zero rows? */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;           /* lengths of x and y in y = op(A) x; only set for CSR */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) {
    /* A has no nonzero rows: op(A)*xx == 0, so the result is yy (already in zz when they alias) or zero */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  } else {
    /* Hermitian, or an explicit transpose was not generated: let cuSPARSE apply op(A) to the
       untransposed storage; otherwise use the pre-built explicit transpose with NON_TRANSPOSE */
    if (herm || !cusparsestruct->transgen) {
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;   /* transposed operation: x has as many entries as A has rows */
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA indexes the per-operation descriptor cache cuSpMV[]; guard against enum drift */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      /* transpose/Hermitian path: dptr was zarray, so only the optional add of yy remains */
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop counts: 2*nz for multiply-add; without yy the nonzero rows save one add each */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
22549ae82921SPaul Mullowney 
/* MatMultTransposeAdd for SEQAIJCUSPARSE: zz = A^T * xx + yy, computed on the GPU. */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Delegate to the shared kernel: trans = PETSC_TRUE, herm = PETSC_FALSE */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2263ca45077fSPaul Mullowney 
/*
   Finish assembly of a SEQAIJCUSPARSE matrix.

   When a device-side split-CSR matrix (deviceMat) exists, its header is copied back to the
   host to learn where assembly actually happened: the device-side nonzerostate and the 'seq'
   flag decide below whether the host values must be pushed to the GPU or the GPU copy is
   already the authoritative one.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;   /* d_mat lives on the device; h_mat is a host copy of its header */
  PetscBool                   is_seq = PETSC_TRUE;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
  }
  if (d_mat) {
    cudaError_t err;
    ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    nnz_state = h_mat.nonzerostate;
    is_seq = h_mat.seq;
  }
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); /* this does very little if assembled on GPU - call it? */
  if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
  if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { /* assembled on CPU even though equipped for GPU */
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  } else if (nnz_state > A->nonzerostate) { /* device copy is newer than the host one */
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }

  PetscFunctionReturn(0);
}
22919ae82921SPaul Mullowney 
22929ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2293e057df02SPaul Mullowney /*@
22949ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2296e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2297e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2298e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2299e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
23009ae82921SPaul Mullowney 
2301d083f849SBarry Smith    Collective
23029ae82921SPaul Mullowney 
23039ae82921SPaul Mullowney    Input Parameters:
23049ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
23059ae82921SPaul Mullowney .  m - number of rows
23069ae82921SPaul Mullowney .  n - number of columns
23079ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
23089ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
23090298fd71SBarry Smith          (possibly different for each row) or NULL
23109ae82921SPaul Mullowney 
23119ae82921SPaul Mullowney    Output Parameter:
23129ae82921SPaul Mullowney .  A - the matrix
23139ae82921SPaul Mullowney 
23149ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
23169ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
23179ae82921SPaul Mullowney 
23189ae82921SPaul Mullowney    Notes:
23199ae82921SPaul Mullowney    If nnz is given then nz is ignored
23209ae82921SPaul Mullowney 
23219ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
23229ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
23239ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
23249ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
23259ae82921SPaul Mullowney 
23269ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
23270298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
23289ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
23299ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
23309ae82921SPaul Mullowney 
23319ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
23329ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
23339ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
23349ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
23359ae82921SPaul Mullowney 
23369ae82921SPaul Mullowney    Level: intermediate
23379ae82921SPaul Mullowney 
2338e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
23399ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Create an empty matrix, give it the CUSPARSE type, then preallocate
     from the user's per-row hints (nnz[] takes precedence over nz) */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
23519ae82921SPaul Mullowney 
/*
   Destroy a SEQAIJCUSPARSE matrix.

   The device split-CSR matrix pointer (deviceMat) is saved and detached BEFORE
   MatSeqAIJCUSPARSE_Destroy() frees spptr, then its device allocations are released
   afterwards; finally composed functions are removed and the base SeqAIJ destroy runs.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;   /* device-resident; only the pointer is held on the host */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
    ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;   /* detach so spptr teardown cannot touch it */
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;   /* host copy of the device header, fetched to reach its interior pointers */
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        /* diag.i was device-allocated only in the compressed-row case */
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
23859ae82921SPaul Mullowney 
2386ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
238795639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/* Duplicate a SEQAIJCUSPARSE matrix: copy as plain SeqAIJ, then convert the copy in place. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* in-place conversion installs the CUSPARSE spptr and function table on *B */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
23979ff858a8SKarl Rupp 
/*
   Bind (flg = PETSC_TRUE) or unbind (flg = PETSC_FALSE) a SEQAIJCUSPARSE matrix to the CPU
   by swapping the mult/multadd/multtranspose function pointers and the composed
   MatProductSetFromOptions hooks between the SeqAIJ (host) and CUSPARSE (device) variants.
   Factored matrices are left untouched.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
           can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? */
  if (flg) {
    /* host (SeqAIJ) implementations; no Hermitian-transpose host variants are installed */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* device (CUSPARSE) implementations */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
243395639643SRichard Tran Mills 
/*
   Zero all entries of a SEQAIJCUSPARSE matrix.

   If a device split-CSR matrix (deviceMat) exists, its value array is zeroed with
   cudaMemset in addition to zeroing the host values via MatZeroEntries_SeqAIJ().

   Fix: the offloadmask is only set to PETSC_OFFLOAD_BOTH when the device values were
   actually zeroed. The previous unconditional PETSC_OFFLOAD_BOTH claimed a stale,
   non-zeroed GPU copy (e.g. the cuSPARSE CSR built by MatSeqAIJCUSPARSECopyToGPU)
   was up to date, so the next MatMult could use old values. Without a device matrix
   we mark PETSC_OFFLOAD_CPU to force a host-to-device refresh on next use.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure  *d_mat = NULL;   /* device-resident split-CSR header, if GPU assembly is set up */
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
    PetscInt     n = A->rmap->n, nnz = a->i[n];   /* total stored nonzeros from the host row offsets */
    cudaError_t  err;
    PetscScalar  *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* fetch the device pointer to the value array stored inside the device-side header */
    err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* byte-wise memset is valid here because the fill value is exactly zero */
    err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);

  /* Host values are now zero; only claim the GPU copy matches when we zeroed it above */
  A->offloadmask = d_mat ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
2458*3fa6b06aSMark Adams 
/*
   Convert a SeqAIJ matrix to SEQAIJCUSPARSE.

   reuse semantics: MAT_INITIAL_MATRIX duplicates A into *newmat; MAT_REUSE_MATRIX copies
   values into the existing *newmat; MAT_INPLACE_MATRIX retypes A itself (B == A).
   Installs the CUDA default vector type, allocates the cuSPARSE side data (spptr) the
   first time, and wires up the CUSPARSE function table via MatBindToCPU_SeqAIJCUSPARSE().
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from B should live on the GPU by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      /* regular matrix: multiply-side data, default CSR storage */
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      /* factored matrix: triangular-factor data */
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* start unbound: install the GPU mult/multadd/... implementations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
25069ae82921SPaul Mullowney 
/*
   Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix, convert it
   in place, then process the CUSPARSE-specific command-line options.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);   /* lazy CUDA runtime initialization */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscObjectOptionsBegin declares the PetscOptionsObject used by the setfromoptions call */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
252002fe1965SBarry Smith 
25213ca39a21SBarry Smith /*MC
2522e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2523e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
25252692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
25262692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2527e057df02SPaul Mullowney 
2528e057df02SPaul Mullowney    Options Database Keys:
2529e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2530aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2531a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2532e057df02SPaul Mullowney 
2533e057df02SPaul Mullowney   Level: beginner
2534e057df02SPaul Mullowney 
25358468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2536e057df02SPaul Mullowney M*/
25377f756511SDominic Meiser 
253842c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
253942c9c57cSBarry Smith 
25400f39cd5aSBarry Smith 
/*
   Register the cusparse-based factorization backends (LU, Cholesky, ILU, ICC)
   for MATSEQAIJCUSPARSE matrices with the MatSolverType registry; all four map
   to the same factory routine MatGetFactor_seqaijcusparse_cusparse().
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftype[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              i;

  PetscFunctionBegin;
  for (i=0; i<sizeof(ftype)/sizeof(ftype[0]); i++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftype[i],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
255229b38603SBarry Smith 
/*
   Free all GPU resources held by a Mat_SeqAIJCUSPARSE context and the context
   itself: the SpMV structure and its transpose, the work and row-offset vectors,
   the cusparse handle, and (CUDA >= 11) the csr2csc conversion buffer.
   PetscFree() zeroes *cusparsestruct on return; a NULL input is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;    /* delete on a NULL pointer is well-defined */
    delete (*cusparsestruct)->rowoffsets_gpu;
    /* explicit test instead of the old assignment-inside-if, which triggers -Wparentheses */
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* only free the conversion scratch buffer if it was ever allocated */
    if ((*cusparsestruct)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
25737f756511SDominic Meiser 
/*
   Release the device-side storage of a CsrMatrix (values, column indices and
   row offsets) plus the struct itself, then zero the caller's pointer.
   A NULL *mat is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  PetscFunctionBegin;
  if (!*mat) PetscFunctionReturn(0);
  delete (*mat)->values;
  delete (*mat)->column_indices;
  delete (*mat)->row_offsets;
  delete *mat;
  *mat = 0;
  PetscFunctionReturn(0);
}
25867f756511SDominic Meiser 
/*
   Free one triangular-factor structure used by the cusparse triangular solves:
   the matrix descriptor, the solve/analysis info, the CSR storage, and
   (CUDA >= 11) the solve and csr2csc scratch buffers. The struct is then
   deleted and the caller's pointer zeroed. A NULL input is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*trifactor) {
    if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
    /* cusparse_destroy_analysis_info maps to the version-appropriate cusparse destroy call */
    if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
    ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cudaError_t cerr;
    /* scratch buffers only exist in the CUDA-11+ generic API path */
    if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
    if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
   #endif
    delete *trifactor;
    *trifactor = 0;
  }
  PetscFunctionReturn(0);
}
26077f756511SDominic Meiser 
/*
   Free a Mat_SeqAIJCUSPARSEMultStruct (the GPU-side SpMV data for one matrix).

   The matrix payload is either a cusparse HYB matrix (pre-CUDA-11 ELL/HYB
   formats, selected by 'format') or a CsrMatrix. Also frees the matrix
   descriptor, the compressed-row index vector, the device-resident scalar
   constants, and (CUDA >= 11) the SpMat descriptor plus any initialized SpMV
   vector descriptors and buffers. On return *matstruct is deleted and zeroed;
   a NULL input is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr); /* error code was previously dropped silently */
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* scalar constants were allocated with cudaMalloc, so release them with cudaFree */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = 0;
  }
  PetscFunctionReturn(0);
}
26517f756511SDominic Meiser 
/*
   Drop the triangular-factor data (all four factor structs, the row/column
   permutation index vectors and the work vector) while keeping the container
   and its cusparse handle alive, so the factors can be rebuilt in place.
   A NULL *trifactors is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  fs = *trifactors;
  if (!fs) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
  delete fs->rpermIndices;
  delete fs->cpermIndices;
  delete fs->workVector;
  fs->rpermIndices = 0;
  fs->cpermIndices = 0;
  fs->workVector   = 0;
  PetscFunctionReturn(0);
}
2671ccdfe979SStefano Zampini 
/*
   Fully destroy a Mat_SeqAIJCUSPARSETriFactors container: reset all factor
   data, destroy the cusparse handle, and free the container itself
   (PetscFree() zeroes *trifactors). A NULL input is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* explicit NULL test instead of the old assignment-inside-if, which triggers -Wparentheses */
    if ((*trifactors)->handle) {
      stat = cusparseDestroy((*trifactors)->handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2688