xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 7e8381f984f038c2461ffb95eefd9a80face03cb)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
749ae82921SPaul Mullowney 
757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
817f756511SDominic Meiser 
82*7e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
83*7e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
84*7e8381f9SStefano Zampini 
85b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
86b06137fdSPaul Mullowney {
87b06137fdSPaul Mullowney   cusparseStatus_t   stat;
88b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
89b06137fdSPaul Mullowney 
90b06137fdSPaul Mullowney   PetscFunctionBegin;
91d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
92b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
9357d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
94b06137fdSPaul Mullowney   PetscFunctionReturn(0);
95b06137fdSPaul Mullowney }
96b06137fdSPaul Mullowney 
97b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
98b06137fdSPaul Mullowney {
99b06137fdSPaul Mullowney   cusparseStatus_t   stat;
100b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
101b06137fdSPaul Mullowney 
102b06137fdSPaul Mullowney   PetscFunctionBegin;
103d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1046b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
10516a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
10657d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
10716a2e217SAlejandro Lamas Daviña     }
108b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1096b1cf21dSAlejandro Lamas Daviña   }
11057d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
111b06137fdSPaul Mullowney   PetscFunctionReturn(0);
112b06137fdSPaul Mullowney }
113b06137fdSPaul Mullowney 
114b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
115b06137fdSPaul Mullowney {
116b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
117*7e8381f9SStefano Zampini   PetscBool          flg;
118*7e8381f9SStefano Zampini   PetscErrorCode     ierr;
119ccdfe979SStefano Zampini 
120b06137fdSPaul Mullowney   PetscFunctionBegin;
121*7e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
122*7e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
123ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
124b06137fdSPaul Mullowney   PetscFunctionReturn(0);
125b06137fdSPaul Mullowney }
126b06137fdSPaul Mullowney 
127ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1289ae82921SPaul Mullowney {
1299ae82921SPaul Mullowney   PetscFunctionBegin;
1309ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1319ae82921SPaul Mullowney   PetscFunctionReturn(0);
1329ae82921SPaul Mullowney }
1339ae82921SPaul Mullowney 
134c708e6cdSJed Brown /*MC
135087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
136087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
137087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
138087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
139087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
140087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
141c708e6cdSJed Brown 
1429ae82921SPaul Mullowney   Level: beginner
143c708e6cdSJed Brown 
1443ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
145c708e6cdSJed Brown M*/
1469ae82921SPaul Mullowney 
14742c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1489ae82921SPaul Mullowney {
1499ae82921SPaul Mullowney   PetscErrorCode ierr;
150bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1519ae82921SPaul Mullowney 
1529ae82921SPaul Mullowney   PetscFunctionBegin;
153bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
154bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1552c7c0729SBarry Smith   (*B)->factortype = ftype;
1562c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1579ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1582205254eSKarl Rupp 
159087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16033d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1619ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1629ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
163087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
164087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
165087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1669ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
167bc3f50f2SPaul Mullowney 
168fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1693ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1709ae82921SPaul Mullowney   PetscFunctionReturn(0);
1719ae82921SPaul Mullowney }
1729ae82921SPaul Mullowney 
173bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
174ca45077fSPaul Mullowney {
175aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1766e111a19SKarl Rupp 
177ca45077fSPaul Mullowney   PetscFunctionBegin;
178ca45077fSPaul Mullowney   switch (op) {
179e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
180aa372e3fSPaul Mullowney     cusparsestruct->format = format;
181ca45077fSPaul Mullowney     break;
182e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
183aa372e3fSPaul Mullowney     cusparsestruct->format = format;
184ca45077fSPaul Mullowney     break;
185ca45077fSPaul Mullowney   default:
18636d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
187ca45077fSPaul Mullowney   }
188ca45077fSPaul Mullowney   PetscFunctionReturn(0);
189ca45077fSPaul Mullowney }
1909ae82921SPaul Mullowney 
191e057df02SPaul Mullowney /*@
192e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
193e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
194aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
195e057df02SPaul Mullowney    Not Collective
196e057df02SPaul Mullowney 
197e057df02SPaul Mullowney    Input Parameters:
1988468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
19936d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2002692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
201e057df02SPaul Mullowney 
202e057df02SPaul Mullowney    Output Parameter:
203e057df02SPaul Mullowney 
204e057df02SPaul Mullowney    Level: intermediate
205e057df02SPaul Mullowney 
2068468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
207e057df02SPaul Mullowney @*/
208e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
209e057df02SPaul Mullowney {
210e057df02SPaul Mullowney   PetscErrorCode ierr;
2116e111a19SKarl Rupp 
212e057df02SPaul Mullowney   PetscFunctionBegin;
213e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
214e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
215e057df02SPaul Mullowney   PetscFunctionReturn(0);
216e057df02SPaul Mullowney }
217e057df02SPaul Mullowney 
218e6e9a74fSStefano Zampini /*@
219e6e9a74fSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
220e6e9a74fSStefano Zampini 
221e6e9a74fSStefano Zampini    Collective on mat
222e6e9a74fSStefano Zampini 
223e6e9a74fSStefano Zampini    Input Parameters:
224e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
225e6e9a74fSStefano Zampini -  transgen - the boolean flag
226e6e9a74fSStefano Zampini 
227e6e9a74fSStefano Zampini    Level: intermediate
228e6e9a74fSStefano Zampini 
229e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE
230e6e9a74fSStefano Zampini @*/
231e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
232e6e9a74fSStefano Zampini {
233e6e9a74fSStefano Zampini   PetscErrorCode ierr;
234e6e9a74fSStefano Zampini   PetscBool      flg;
235e6e9a74fSStefano Zampini 
236e6e9a74fSStefano Zampini   PetscFunctionBegin;
237e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
238e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
239e6e9a74fSStefano Zampini   if (flg) {
240e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
24154da937aSStefano Zampini 
242e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
243e6e9a74fSStefano Zampini     cusp->transgen = transgen;
24454da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
24554da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
24654da937aSStefano Zampini     }
247e6e9a74fSStefano Zampini   }
248e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
249e6e9a74fSStefano Zampini }
250e6e9a74fSStefano Zampini 
2514416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2529ae82921SPaul Mullowney {
2539ae82921SPaul Mullowney   PetscErrorCode           ierr;
254e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2559ae82921SPaul Mullowney   PetscBool                flg;
256a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2576e111a19SKarl Rupp 
2589ae82921SPaul Mullowney   PetscFunctionBegin;
259e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2609ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
26154da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
26254da937aSStefano Zampini 
26354da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
264afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
265afb2bd1cSJunchao Zhang 
266e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
267a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
268afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
269afb2bd1cSJunchao Zhang 
2704c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
271a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
272afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
273afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
274afb2bd1cSJunchao Zhang     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
275afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
276afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
277afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
278afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
279afb2bd1cSJunchao Zhang 
280afb2bd1cSJunchao Zhang     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
281afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
282afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
283afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
284afb2bd1cSJunchao Zhang 
285afb2bd1cSJunchao Zhang     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
286afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
287afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
288afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
289afb2bd1cSJunchao Zhang    #endif
2904c87dfd4SPaul Mullowney   }
2910af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2929ae82921SPaul Mullowney   PetscFunctionReturn(0);
2939ae82921SPaul Mullowney }
2949ae82921SPaul Mullowney 
2956fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2969ae82921SPaul Mullowney {
2979ae82921SPaul Mullowney   PetscErrorCode ierr;
2989ae82921SPaul Mullowney 
2999ae82921SPaul Mullowney   PetscFunctionBegin;
3009ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3019ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3029ae82921SPaul Mullowney   PetscFunctionReturn(0);
3039ae82921SPaul Mullowney }
3049ae82921SPaul Mullowney 
3056fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3069ae82921SPaul Mullowney {
3079ae82921SPaul Mullowney   PetscErrorCode ierr;
3089ae82921SPaul Mullowney 
3099ae82921SPaul Mullowney   PetscFunctionBegin;
3109ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3119ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3129ae82921SPaul Mullowney   PetscFunctionReturn(0);
3139ae82921SPaul Mullowney }
3149ae82921SPaul Mullowney 
315087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
316087f3262SPaul Mullowney {
317087f3262SPaul Mullowney   PetscErrorCode ierr;
318087f3262SPaul Mullowney 
319087f3262SPaul Mullowney   PetscFunctionBegin;
320087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
321087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
322087f3262SPaul Mullowney   PetscFunctionReturn(0);
323087f3262SPaul Mullowney }
324087f3262SPaul Mullowney 
325087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
326087f3262SPaul Mullowney {
327087f3262SPaul Mullowney   PetscErrorCode ierr;
328087f3262SPaul Mullowney 
329087f3262SPaul Mullowney   PetscFunctionBegin;
330087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
331087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
332087f3262SPaul Mullowney   PetscFunctionReturn(0);
333087f3262SPaul Mullowney }
334087f3262SPaul Mullowney 
335087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3369ae82921SPaul Mullowney {
3379ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3389ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3399ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
340aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3419ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3429ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3439ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3449ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3459ae82921SPaul Mullowney   PetscScalar                       *AALo;
3469ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
347b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
34857d48284SJunchao Zhang   cudaError_t                       cerr;
3499ae82921SPaul Mullowney 
3509ae82921SPaul Mullowney   PetscFunctionBegin;
351cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
352c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3539ae82921SPaul Mullowney     try {
3549ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3559ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
3569ae82921SPaul Mullowney 
3579ae82921SPaul Mullowney       /* Allocate Space for the lower triangular matrix */
35857d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
35957d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
36057d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3619ae82921SPaul Mullowney 
3629ae82921SPaul Mullowney       /* Fill the lower triangular matrix */
3639ae82921SPaul Mullowney       AiLo[0]  = (PetscInt) 0;
3649ae82921SPaul Mullowney       AiLo[n]  = nzLower;
3659ae82921SPaul Mullowney       AjLo[0]  = (PetscInt) 0;
3669ae82921SPaul Mullowney       AALo[0]  = (MatScalar) 1.0;
3679ae82921SPaul Mullowney       v        = aa;
3689ae82921SPaul Mullowney       vi       = aj;
3699ae82921SPaul Mullowney       offset   = 1;
3709ae82921SPaul Mullowney       rowOffset= 1;
3719ae82921SPaul Mullowney       for (i=1; i<n; i++) {
3729ae82921SPaul Mullowney         nz = ai[i+1] - ai[i];
373e057df02SPaul Mullowney         /* additional 1 for the term on the diagonal */
3749ae82921SPaul Mullowney         AiLo[i]    = rowOffset;
3759ae82921SPaul Mullowney         rowOffset += nz+1;
3769ae82921SPaul Mullowney 
377580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
378580bdb30SBarry Smith         ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3799ae82921SPaul Mullowney 
3809ae82921SPaul Mullowney         offset      += nz;
3819ae82921SPaul Mullowney         AjLo[offset] = (PetscInt) i;
3829ae82921SPaul Mullowney         AALo[offset] = (MatScalar) 1.0;
3839ae82921SPaul Mullowney         offset      += 1;
3849ae82921SPaul Mullowney 
3859ae82921SPaul Mullowney         v  += nz;
3869ae82921SPaul Mullowney         vi += nz;
3879ae82921SPaul Mullowney       }
3882205254eSKarl Rupp 
389aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
390aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
3912205254eSKarl Rupp 
392aa372e3fSPaul Mullowney       /* Create the matrix description */
39357d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
39457d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
395afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
396afb2bd1cSJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
397afb2bd1cSJunchao Zhang      #else
39857d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
399afb2bd1cSJunchao Zhang      #endif
40057d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
40157d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
402aa372e3fSPaul Mullowney 
403aa372e3fSPaul Mullowney       /* set the operation */
404aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
405aa372e3fSPaul Mullowney 
406aa372e3fSPaul Mullowney       /* set the matrix */
407aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
408aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = n;
409aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = n;
410aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = nzLower;
411aa372e3fSPaul Mullowney 
412aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
413aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
414aa372e3fSPaul Mullowney 
415aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
416aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
417aa372e3fSPaul Mullowney 
418aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
419aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
420aa372e3fSPaul Mullowney 
421afb2bd1cSJunchao Zhang       /* Create the solve analysis information */
422afb2bd1cSJunchao Zhang       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
423afb2bd1cSJunchao Zhang     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
424afb2bd1cSJunchao Zhang       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
425afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
426afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
427afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
428afb2bd1cSJunchao Zhang                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
429afb2bd1cSJunchao Zhang       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
430afb2bd1cSJunchao Zhang     #endif
431afb2bd1cSJunchao Zhang 
432aa372e3fSPaul Mullowney       /* perform the solve analysis */
433aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
434aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
435aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
436afb2bd1cSJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
437afb2bd1cSJunchao Zhang                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
438afb2bd1cSJunchao Zhang                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
439afb2bd1cSJunchao Zhang                              #endif
440afb2bd1cSJunchao Zhang                               );CHKERRCUSPARSE(stat);
441aa372e3fSPaul Mullowney 
442aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
443aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4442205254eSKarl Rupp 
44557d48284SJunchao Zhang       cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
44657d48284SJunchao Zhang       cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
44757d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
4484863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
4499ae82921SPaul Mullowney     } catch(char *ex) {
4509ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4519ae82921SPaul Mullowney     }
4529ae82921SPaul Mullowney   }
4539ae82921SPaul Mullowney   PetscFunctionReturn(0);
4549ae82921SPaul Mullowney }
4559ae82921SPaul Mullowney 
456087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4579ae82921SPaul Mullowney {
4589ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4599ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4609ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
461aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
4629ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4639ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
4649ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
4659ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4669ae82921SPaul Mullowney   PetscScalar                       *AAUp;
4679ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
4689ae82921SPaul Mullowney   PetscErrorCode                    ierr;
46957d48284SJunchao Zhang   cudaError_t                       cerr;
4709ae82921SPaul Mullowney 
4719ae82921SPaul Mullowney   PetscFunctionBegin;
472cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
473c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4749ae82921SPaul Mullowney     try {
4759ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4769ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
4779ae82921SPaul Mullowney 
4789ae82921SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
47957d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
48057d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
48157d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
4829ae82921SPaul Mullowney 
4839ae82921SPaul Mullowney       /* Fill the upper triangular matrix */
4849ae82921SPaul Mullowney       AiUp[0]=(PetscInt) 0;
4859ae82921SPaul Mullowney       AiUp[n]=nzUpper;
4869ae82921SPaul Mullowney       offset = nzUpper;
4879ae82921SPaul Mullowney       for (i=n-1; i>=0; i--) {
4889ae82921SPaul Mullowney         v  = aa + adiag[i+1] + 1;
4899ae82921SPaul Mullowney         vi = aj + adiag[i+1] + 1;
4909ae82921SPaul Mullowney 
491e057df02SPaul Mullowney         /* number of elements NOT on the diagonal */
4929ae82921SPaul Mullowney         nz = adiag[i] - adiag[i+1]-1;
4939ae82921SPaul Mullowney 
494e057df02SPaul Mullowney         /* decrement the offset */
4959ae82921SPaul Mullowney         offset -= (nz+1);
4969ae82921SPaul Mullowney 
497e057df02SPaul Mullowney         /* first, set the diagonal elements */
4989ae82921SPaul Mullowney         AjUp[offset] = (PetscInt) i;
49909f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1./v[nz];
5009ae82921SPaul Mullowney         AiUp[i]      = AiUp[i+1] - (nz+1);
5019ae82921SPaul Mullowney 
502580bdb30SBarry Smith         ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
503580bdb30SBarry Smith         ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5049ae82921SPaul Mullowney       }
5052205254eSKarl Rupp 
506aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
507aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
5082205254eSKarl Rupp 
509aa372e3fSPaul Mullowney       /* Create the matrix description */
51057d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
51157d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
512afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
513afb2bd1cSJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
514afb2bd1cSJunchao Zhang      #else
51557d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
516afb2bd1cSJunchao Zhang      #endif
51757d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
51857d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
519aa372e3fSPaul Mullowney 
520aa372e3fSPaul Mullowney       /* set the operation */
521aa372e3fSPaul Mullowney       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
522aa372e3fSPaul Mullowney 
523aa372e3fSPaul Mullowney       /* set the matrix */
524aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
525aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = n;
526aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = n;
527aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = nzUpper;
528aa372e3fSPaul Mullowney 
529aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
530aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
531aa372e3fSPaul Mullowney 
532aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
533aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
534aa372e3fSPaul Mullowney 
535aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
536aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
537aa372e3fSPaul Mullowney 
538afb2bd1cSJunchao Zhang       /* Create the solve analysis information */
539afb2bd1cSJunchao Zhang       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
540afb2bd1cSJunchao Zhang     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
541afb2bd1cSJunchao Zhang       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
542afb2bd1cSJunchao Zhang                                    upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
543afb2bd1cSJunchao Zhang                                    upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
544afb2bd1cSJunchao Zhang                                    upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
545afb2bd1cSJunchao Zhang                                    &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
546afb2bd1cSJunchao Zhang       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
547afb2bd1cSJunchao Zhang     #endif
548afb2bd1cSJunchao Zhang 
549aa372e3fSPaul Mullowney       /* perform the solve analysis */
550aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
551aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
552aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
553afb2bd1cSJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
554afb2bd1cSJunchao Zhang                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
555afb2bd1cSJunchao Zhang                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
556afb2bd1cSJunchao Zhang                              #endif
557afb2bd1cSJunchao Zhang                               );CHKERRCUSPARSE(stat);
558aa372e3fSPaul Mullowney 
559aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
560aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
5612205254eSKarl Rupp 
56257d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
56357d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
56457d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
5654863603aSSatish Balay       ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
5669ae82921SPaul Mullowney     } catch(char *ex) {
5679ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5689ae82921SPaul Mullowney     }
5699ae82921SPaul Mullowney   }
5709ae82921SPaul Mullowney   PetscFunctionReturn(0);
5719ae82921SPaul Mullowney }
5729ae82921SPaul Mullowney 
573087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
5749ae82921SPaul Mullowney {
5759ae82921SPaul Mullowney   PetscErrorCode               ierr;
5769ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
5779ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
5789ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
5799ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
5809ae82921SPaul Mullowney   const PetscInt               *r,*c;
5819ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
5829ae82921SPaul Mullowney 
5839ae82921SPaul Mullowney   PetscFunctionBegin;
584ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
585087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
586087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
5872205254eSKarl Rupp 
588e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
589aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
5909ae82921SPaul Mullowney 
591c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
592e057df02SPaul Mullowney   /* lower triangular indices */
5939ae82921SPaul Mullowney   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5949ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5952205254eSKarl Rupp   if (!row_identity) {
596aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
597aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
5982205254eSKarl Rupp   }
5999ae82921SPaul Mullowney   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
6009ae82921SPaul Mullowney 
601e057df02SPaul Mullowney   /* upper triangular indices */
6029ae82921SPaul Mullowney   ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
6039ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
6042205254eSKarl Rupp   if (!col_identity) {
605aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
606aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6072205254eSKarl Rupp   }
6084863603aSSatish Balay 
6094863603aSSatish Balay   if (!row_identity && !col_identity) {
6104863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
6114863603aSSatish Balay   } else if(!row_identity) {
6124863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
6134863603aSSatish Balay   } else if(!col_identity) {
6144863603aSSatish Balay     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
6154863603aSSatish Balay   }
6164863603aSSatish Balay 
6179ae82921SPaul Mullowney   ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
6189ae82921SPaul Mullowney   PetscFunctionReturn(0);
6199ae82921SPaul Mullowney }
6209ae82921SPaul Mullowney 
621087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
622087f3262SPaul Mullowney {
623087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
624087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
625aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
626aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
627087f3262SPaul Mullowney   cusparseStatus_t                  stat;
628087f3262SPaul Mullowney   PetscErrorCode                    ierr;
62957d48284SJunchao Zhang   cudaError_t                       cerr;
630087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
631087f3262SPaul Mullowney   PetscScalar                       *AAUp;
632087f3262SPaul Mullowney   PetscScalar                       *AALo;
633087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
634087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
635087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
636087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
637087f3262SPaul Mullowney 
638087f3262SPaul Mullowney   PetscFunctionBegin;
639cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
640c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
641087f3262SPaul Mullowney     try {
642087f3262SPaul Mullowney       /* Allocate Space for the upper triangular matrix */
64357d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
64457d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
64557d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
64657d48284SJunchao Zhang       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
647087f3262SPaul Mullowney 
648087f3262SPaul Mullowney       /* Fill the upper triangular matrix */
649087f3262SPaul Mullowney       AiUp[0]=(PetscInt) 0;
650087f3262SPaul Mullowney       AiUp[n]=nzUpper;
651087f3262SPaul Mullowney       offset = 0;
652087f3262SPaul Mullowney       for (i=0; i<n; i++) {
653087f3262SPaul Mullowney         /* set the pointers */
654087f3262SPaul Mullowney         v  = aa + ai[i];
655087f3262SPaul Mullowney         vj = aj + ai[i];
656087f3262SPaul Mullowney         nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
657087f3262SPaul Mullowney 
658087f3262SPaul Mullowney         /* first, set the diagonal elements */
659087f3262SPaul Mullowney         AjUp[offset] = (PetscInt) i;
66009f51544SAlejandro Lamas Daviña         AAUp[offset] = (MatScalar)1.0/v[nz];
661087f3262SPaul Mullowney         AiUp[i]      = offset;
66209f51544SAlejandro Lamas Daviña         AALo[offset] = (MatScalar)1.0/v[nz];
663087f3262SPaul Mullowney 
664087f3262SPaul Mullowney         offset+=1;
665087f3262SPaul Mullowney         if (nz>0) {
666f22e0265SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
667580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
668087f3262SPaul Mullowney           for (j=offset; j<offset+nz; j++) {
669087f3262SPaul Mullowney             AAUp[j] = -AAUp[j];
670087f3262SPaul Mullowney             AALo[j] = AAUp[j]/v[nz];
671087f3262SPaul Mullowney           }
672087f3262SPaul Mullowney           offset+=nz;
673087f3262SPaul Mullowney         }
674087f3262SPaul Mullowney       }
675087f3262SPaul Mullowney 
676aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
677aa372e3fSPaul Mullowney       upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
678087f3262SPaul Mullowney 
679aa372e3fSPaul Mullowney       /* Create the matrix description */
68057d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
68157d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
682afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
683afb2bd1cSJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
684afb2bd1cSJunchao Zhang      #else
68557d48284SJunchao Zhang       stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
686afb2bd1cSJunchao Zhang      #endif
68757d48284SJunchao Zhang       stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
68857d48284SJunchao Zhang       stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
689087f3262SPaul Mullowney 
690aa372e3fSPaul Mullowney       /* set the matrix */
691aa372e3fSPaul Mullowney       upTriFactor->csrMat = new CsrMatrix;
692aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_rows = A->rmap->n;
693aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_cols = A->cmap->n;
694aa372e3fSPaul Mullowney       upTriFactor->csrMat->num_entries = a->nz;
695aa372e3fSPaul Mullowney 
696aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
697aa372e3fSPaul Mullowney       upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
698aa372e3fSPaul Mullowney 
699aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
700aa372e3fSPaul Mullowney       upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
701aa372e3fSPaul Mullowney 
702aa372e3fSPaul Mullowney       upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
703aa372e3fSPaul Mullowney       upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
704aa372e3fSPaul Mullowney 
705afb2bd1cSJunchao Zhang       /* set the operation */
706afb2bd1cSJunchao Zhang       upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
707afb2bd1cSJunchao Zhang 
708afb2bd1cSJunchao Zhang       /* Create the solve analysis information */
709afb2bd1cSJunchao Zhang       stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
710afb2bd1cSJunchao Zhang     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
711afb2bd1cSJunchao Zhang       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
712afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
713afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
714afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
715afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
716afb2bd1cSJunchao Zhang       cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
717afb2bd1cSJunchao Zhang     #endif
718afb2bd1cSJunchao Zhang 
719aa372e3fSPaul Mullowney       /* perform the solve analysis */
720aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
721aa372e3fSPaul Mullowney                                upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
722aa372e3fSPaul Mullowney                                upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
723afb2bd1cSJunchao Zhang                                upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
724afb2bd1cSJunchao Zhang                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
725afb2bd1cSJunchao Zhang                                ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
726afb2bd1cSJunchao Zhang                               #endif
727afb2bd1cSJunchao Zhang                               );CHKERRCUSPARSE(stat);
728aa372e3fSPaul Mullowney 
729aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
730aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
731aa372e3fSPaul Mullowney 
732aa372e3fSPaul Mullowney       /* allocate space for the triangular factor information */
733aa372e3fSPaul Mullowney       loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;
734aa372e3fSPaul Mullowney 
735aa372e3fSPaul Mullowney       /* Create the matrix description */
73657d48284SJunchao Zhang       stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
73757d48284SJunchao Zhang       stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
738afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
739afb2bd1cSJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
740afb2bd1cSJunchao Zhang      #else
74157d48284SJunchao Zhang       stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
742afb2bd1cSJunchao Zhang      #endif
74357d48284SJunchao Zhang       stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
74457d48284SJunchao Zhang       stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
745aa372e3fSPaul Mullowney 
746aa372e3fSPaul Mullowney       /* set the operation */
747aa372e3fSPaul Mullowney       loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
748aa372e3fSPaul Mullowney 
749aa372e3fSPaul Mullowney       /* set the matrix */
750aa372e3fSPaul Mullowney       loTriFactor->csrMat = new CsrMatrix;
751aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_rows = A->rmap->n;
752aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_cols = A->cmap->n;
753aa372e3fSPaul Mullowney       loTriFactor->csrMat->num_entries = a->nz;
754aa372e3fSPaul Mullowney 
755aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
756aa372e3fSPaul Mullowney       loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
757aa372e3fSPaul Mullowney 
758aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
759aa372e3fSPaul Mullowney       loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
760aa372e3fSPaul Mullowney 
761aa372e3fSPaul Mullowney       loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
762aa372e3fSPaul Mullowney       loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
7634863603aSSatish Balay       ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
764aa372e3fSPaul Mullowney 
765afb2bd1cSJunchao Zhang       /* Create the solve analysis information */
766afb2bd1cSJunchao Zhang       stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
767afb2bd1cSJunchao Zhang     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
768afb2bd1cSJunchao Zhang       stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
769afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
770afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
771afb2bd1cSJunchao Zhang                                      loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
772afb2bd1cSJunchao Zhang                                      &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
773afb2bd1cSJunchao Zhang       cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
774afb2bd1cSJunchao Zhang     #endif
775afb2bd1cSJunchao Zhang 
776aa372e3fSPaul Mullowney       /* perform the solve analysis */
777aa372e3fSPaul Mullowney       stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
778aa372e3fSPaul Mullowney                                loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
779aa372e3fSPaul Mullowney                                loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
780afb2bd1cSJunchao Zhang                                loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
781afb2bd1cSJunchao Zhang                               #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
782afb2bd1cSJunchao Zhang                                ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
783afb2bd1cSJunchao Zhang                               #endif
784afb2bd1cSJunchao Zhang                               );CHKERRCUSPARSE(stat);
785aa372e3fSPaul Mullowney 
786aa372e3fSPaul Mullowney       /* assign the pointer. Is this really necessary? */
787aa372e3fSPaul Mullowney       ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
788087f3262SPaul Mullowney 
789c70f7ee4SJunchao Zhang       A->offloadmask = PETSC_OFFLOAD_BOTH;
79057d48284SJunchao Zhang       cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
79157d48284SJunchao Zhang       cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
79257d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
79357d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
794087f3262SPaul Mullowney     } catch(char *ex) {
795087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
796087f3262SPaul Mullowney     }
797087f3262SPaul Mullowney   }
798087f3262SPaul Mullowney   PetscFunctionReturn(0);
799087f3262SPaul Mullowney }
800087f3262SPaul Mullowney 
801087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
8029ae82921SPaul Mullowney {
8039ae82921SPaul Mullowney   PetscErrorCode               ierr;
804087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
805087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
806087f3262SPaul Mullowney   IS                           ip = a->row;
807087f3262SPaul Mullowney   const PetscInt               *rip;
808087f3262SPaul Mullowney   PetscBool                    perm_identity;
809087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
810087f3262SPaul Mullowney 
811087f3262SPaul Mullowney   PetscFunctionBegin;
812ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
813087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
814e65717acSKarl Rupp   cusparseTriFactors->workVector = new THRUSTARRAY(n);
815aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
816aa372e3fSPaul Mullowney 
817087f3262SPaul Mullowney   /* lower triangular indices */
818087f3262SPaul Mullowney   ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
819087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
820087f3262SPaul Mullowney   if (!perm_identity) {
8214e4bbfaaSStefano Zampini     IS             iip;
8224e4bbfaaSStefano Zampini     const PetscInt *irip;
8234e4bbfaaSStefano Zampini 
8244e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
8254e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
826aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
827aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
828aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
8294e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
8304e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
8314e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
8324863603aSSatish Balay     ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
833087f3262SPaul Mullowney   }
834087f3262SPaul Mullowney   ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
835087f3262SPaul Mullowney   PetscFunctionReturn(0);
836087f3262SPaul Mullowney }
837087f3262SPaul Mullowney 
8386fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
8399ae82921SPaul Mullowney {
8409ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
8419ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
8429ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
843b175d8bbSPaul Mullowney   PetscErrorCode ierr;
8449ae82921SPaul Mullowney 
8459ae82921SPaul Mullowney   PetscFunctionBegin;
8469ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
847ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
848e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
8499ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
8509ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
851bda325fcSPaul Mullowney   if (row_identity && col_identity) {
852bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
853bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
8544e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8554e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
856bda325fcSPaul Mullowney   } else {
857bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
858bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
8594e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8604e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
861bda325fcSPaul Mullowney   }
8628dc1d2a3SPaul Mullowney 
863e057df02SPaul Mullowney   /* get the triangular factors */
864087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
8659ae82921SPaul Mullowney   PetscFunctionReturn(0);
8669ae82921SPaul Mullowney }
8679ae82921SPaul Mullowney 
868087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
869087f3262SPaul Mullowney {
870087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
871087f3262SPaul Mullowney   IS             ip = b->row;
872087f3262SPaul Mullowney   PetscBool      perm_identity;
873b175d8bbSPaul Mullowney   PetscErrorCode ierr;
874087f3262SPaul Mullowney 
875087f3262SPaul Mullowney   PetscFunctionBegin;
876087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
877ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
878087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
879087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
880087f3262SPaul Mullowney   if (perm_identity) {
881087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
882087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
8834e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8844e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
885087f3262SPaul Mullowney   } else {
886087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
887087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
8884e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
8894e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
890087f3262SPaul Mullowney   }
891087f3262SPaul Mullowney 
892087f3262SPaul Mullowney   /* get the triangular factors */
893087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
894087f3262SPaul Mullowney   PetscFunctionReturn(0);
895087f3262SPaul Mullowney }
8969ae82921SPaul Mullowney 
897b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
898bda325fcSPaul Mullowney {
899bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
900aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
901aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
902aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
903aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
904bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
905aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
906aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
907aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
908aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
909b175d8bbSPaul Mullowney 
910bda325fcSPaul Mullowney   PetscFunctionBegin;
911bda325fcSPaul Mullowney 
912aa372e3fSPaul Mullowney   /*********************************************/
913aa372e3fSPaul Mullowney   /* Now the Transpose of the Lower Tri Factor */
914aa372e3fSPaul Mullowney   /*********************************************/
915aa372e3fSPaul Mullowney 
916aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
917aa372e3fSPaul Mullowney   loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
918aa372e3fSPaul Mullowney 
919aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
920aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
921aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
922aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
923aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
924aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
925aa372e3fSPaul Mullowney 
926aa372e3fSPaul Mullowney   /* Create the matrix description */
92757d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
92857d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
92957d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
93057d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
93157d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
932aa372e3fSPaul Mullowney 
933aa372e3fSPaul Mullowney   /* set the operation */
934aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
935aa372e3fSPaul Mullowney 
936aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
937aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
938afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
939afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
940aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
941afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
942afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
943afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
944aa372e3fSPaul Mullowney 
945aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
946afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
947afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
948afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
949afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
950afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
951afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
952afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
953afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
954afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
955afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
956afb2bd1cSJunchao Zhang   cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
957afb2bd1cSJunchao Zhang #endif
958afb2bd1cSJunchao Zhang 
959aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
960aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
961aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
962aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
963aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
964aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
965afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
966afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
967afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
968afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
969afb2bd1cSJunchao Zhang                         #else
970afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
971afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
972afb2bd1cSJunchao Zhang                         #endif
973afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
974aa372e3fSPaul Mullowney 
975afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
976afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
977afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
978afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
979afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
980afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
981afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
982afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
983afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
984afb2bd1cSJunchao Zhang #endif
985afb2bd1cSJunchao Zhang 
986afb2bd1cSJunchao Zhang   /* perform the solve analysis */
987aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
988afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
989afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
990afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
991afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
992afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
993afb2bd1cSJunchao Zhang                           #endif
994afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
995aa372e3fSPaul Mullowney 
996aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
997aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
998aa372e3fSPaul Mullowney 
999aa372e3fSPaul Mullowney   /*********************************************/
1000aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1001aa372e3fSPaul Mullowney   /*********************************************/
1002aa372e3fSPaul Mullowney 
1003aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1004aa372e3fSPaul Mullowney   upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct;
1005aa372e3fSPaul Mullowney 
1006aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1007aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1008aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1009aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1010aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1011aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1012aa372e3fSPaul Mullowney 
1013aa372e3fSPaul Mullowney   /* Create the matrix description */
101457d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
101557d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
101657d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
101757d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
101857d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1019aa372e3fSPaul Mullowney 
1020aa372e3fSPaul Mullowney   /* set the operation */
1021aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1022aa372e3fSPaul Mullowney 
1023aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1024aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1025afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1026afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1027aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1028afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1029afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1030afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1031aa372e3fSPaul Mullowney 
1032aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1033afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1034afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1035afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1036afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1037afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1038afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1039afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1040afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1041afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1042afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1043afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1044afb2bd1cSJunchao Zhang #endif
1045afb2bd1cSJunchao Zhang 
1046aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1047aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1048aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1049aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1050aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1051aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1052afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1053afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1054afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1055afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1056afb2bd1cSJunchao Zhang                         #else
1057afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1058afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1059afb2bd1cSJunchao Zhang                         #endif
1060afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1061aa372e3fSPaul Mullowney 
1062afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1063afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
1064afb2bd1cSJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1065afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1066afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1067afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1068afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1069afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1070afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1071afb2bd1cSJunchao Zhang   #endif
1072afb2bd1cSJunchao Zhang 
1073afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1074aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1075afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1076afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1077afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
1078afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1079afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1080afb2bd1cSJunchao Zhang                           #endif
1081afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
1082aa372e3fSPaul Mullowney 
1083aa372e3fSPaul Mullowney   /* assign the pointer. Is this really necessary? */
1084aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1085bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1086bda325fcSPaul Mullowney }
1087bda325fcSPaul Mullowney 
1088b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1089bda325fcSPaul Mullowney {
1090aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1091aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1092aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1093bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1094bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1095aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1096b06137fdSPaul Mullowney   cudaError_t                  err;
109785ba7357SStefano Zampini   PetscErrorCode               ierr;
1098b175d8bbSPaul Mullowney 
1099bda325fcSPaul Mullowney   PetscFunctionBegin;
110085ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
110185ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
110285ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
110385ba7357SStefano Zampini   /* create cusparse matrix */
1104aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
110557d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1106aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
110757d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
110857d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1109aa372e3fSPaul Mullowney 
1110b06137fdSPaul Mullowney   /* set alpha and beta */
1111afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
11127656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
11137656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1114afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
11157656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
11167656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
111757d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1118b06137fdSPaul Mullowney 
1119aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1120aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1121aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1122554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1123554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1124aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1125a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1126aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1127aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1128a3fdcf43SKarl Rupp 
112981902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
113081902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1131afb2bd1cSJunchao Zhang 
113281902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1133afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1134afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1135afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1136afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1137afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1138afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1139afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1140afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1141afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1142afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1143afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1144afb2bd1cSJunchao Zhang    #endif
1145afb2bd1cSJunchao Zhang 
1146a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1147a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1148aa372e3fSPaul Mullowney                             matrix->values->data().get(),
114981902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1150aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1151aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1152afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1153afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1154afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1155afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1156afb2bd1cSJunchao Zhang                           #else
1157afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1158afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1159afb2bd1cSJunchao Zhang                           #endif
1160afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
1161aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1162afb2bd1cSJunchao Zhang 
1163afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1164afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1165afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1166afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1167afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1168afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1169afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1170afb2bd1cSJunchao Zhang    #endif
1171aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1172afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1173afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1174afb2bd1cSJunchao Zhang    #else
1175aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
117651c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
117751c6d536SStefano Zampini     /* First convert HYB to CSR */
1178aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1179aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1180aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1181aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1182aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1183aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1184aa372e3fSPaul Mullowney 
1185aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1186aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1187aa372e3fSPaul Mullowney                             temp->values->data().get(),
1188aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
118957d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1190aa372e3fSPaul Mullowney 
1191aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1192aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1193aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1194aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1195aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1196aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1197aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1198aa372e3fSPaul Mullowney 
1199aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1200aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1201aa372e3fSPaul Mullowney                             temp->values->data().get(),
1202aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1203aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1204aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1205aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1206aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
120757d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1208aa372e3fSPaul Mullowney 
1209aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1210aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
121157d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1212aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1213aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1214aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1215aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1216aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1217aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
121857d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1219aa372e3fSPaul Mullowney 
1220aa372e3fSPaul Mullowney     /* assign the pointer */
1221aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1222aa372e3fSPaul Mullowney     /* delete temporaries */
1223aa372e3fSPaul Mullowney     if (tempT) {
1224aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1225aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1226aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1227aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1228087f3262SPaul Mullowney     }
1229aa372e3fSPaul Mullowney     if (temp) {
1230aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1231aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1232aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1233aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1234aa372e3fSPaul Mullowney     }
1235afb2bd1cSJunchao Zhang    #endif
1236aa372e3fSPaul Mullowney   }
123705035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
123885ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
123985ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1240213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1241213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1242aa372e3fSPaul Mullowney   /* assign the pointer */
1243aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1244bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1245bda325fcSPaul Mullowney }
1246bda325fcSPaul Mullowney 
12474e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
12486fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1249bda325fcSPaul Mullowney {
1250c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1251465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1252465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1253465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1254465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1255bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1256bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1257aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1258aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1259aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1260b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
126157d48284SJunchao Zhang   cudaError_t                           cerr;
1262bda325fcSPaul Mullowney 
1263bda325fcSPaul Mullowney   PetscFunctionBegin;
1264aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1265aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1266bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1267aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1268aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1269bda325fcSPaul Mullowney   }
1270bda325fcSPaul Mullowney 
1271bda325fcSPaul Mullowney   /* Get the GPU pointers */
1272c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1273c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1274c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1275c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1276bda325fcSPaul Mullowney 
12777a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1278aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1279c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1280c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1281c41cb2e2SAlejandro Lamas Daviña                xGPU);
1282aa372e3fSPaul Mullowney 
1283aa372e3fSPaul Mullowney   /* First, solve U */
1284aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1285afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
1286afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1287afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1288afb2bd1cSJunchao Zhang                       #endif
1289afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1290aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1291aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1292aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1293aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1294afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
1295afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1296afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1297afb2bd1cSJunchao Zhang                       #endif
1298afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1299aa372e3fSPaul Mullowney 
1300aa372e3fSPaul Mullowney   /* Then, solve L */
1301aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1302afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
1303afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1304afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1305afb2bd1cSJunchao Zhang                       #endif
1306afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1307aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1308aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1309aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1310aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1311afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1312afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1313afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1314afb2bd1cSJunchao Zhang                       #endif
1315afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1316aa372e3fSPaul Mullowney 
1317aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1318c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1319c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1320aa372e3fSPaul Mullowney                tempGPU->begin());
1321aa372e3fSPaul Mullowney 
1322aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1323c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1324bda325fcSPaul Mullowney 
1325bda325fcSPaul Mullowney   /* restore */
1326c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1327c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
132805035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1329661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1330958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1331bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1332bda325fcSPaul Mullowney }
1333bda325fcSPaul Mullowney 
13346fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1335bda325fcSPaul Mullowney {
1336465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1337465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1338bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1339bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1340aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1341aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1342aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1343b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
134457d48284SJunchao Zhang   cudaError_t                       cerr;
1345bda325fcSPaul Mullowney 
1346bda325fcSPaul Mullowney   PetscFunctionBegin;
1347aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1348aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1349bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1350aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1351aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1352bda325fcSPaul Mullowney   }
1353bda325fcSPaul Mullowney 
1354bda325fcSPaul Mullowney   /* Get the GPU pointers */
1355c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1356c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1357bda325fcSPaul Mullowney 
13587a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1359aa372e3fSPaul Mullowney   /* First, solve U */
1360aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1361afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
1362afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1363afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1364afb2bd1cSJunchao Zhang                       #endif
1365afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1366aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1367aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1368aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1369aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1370afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
1371afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1372afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1373afb2bd1cSJunchao Zhang                       #endif
1374afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1375aa372e3fSPaul Mullowney 
1376aa372e3fSPaul Mullowney   /* Then, solve L */
1377aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1378afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
1379afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1380afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1381afb2bd1cSJunchao Zhang                       #endif
1382afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1383aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1384aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1385aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1386aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1387afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1388afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1389afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1390afb2bd1cSJunchao Zhang                       #endif
1391afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1392bda325fcSPaul Mullowney 
1393bda325fcSPaul Mullowney   /* restore */
1394c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1395c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
139605035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1397661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1398958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1399bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1400bda325fcSPaul Mullowney }
1401bda325fcSPaul Mullowney 
14026fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
14039ae82921SPaul Mullowney {
1404465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1405465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1406465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1407465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
14089ae82921SPaul Mullowney   cusparseStatus_t                      stat;
14099ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1410aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1411aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1412aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1413b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
141457d48284SJunchao Zhang   cudaError_t                           cerr;
14159ae82921SPaul Mullowney 
14169ae82921SPaul Mullowney   PetscFunctionBegin;
1417ebc8f436SDominic Meiser 
1418e057df02SPaul Mullowney   /* Get the GPU pointers */
1419c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1420c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1421c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1422c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
14239ae82921SPaul Mullowney 
14247a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1425aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1426c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1427c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
14284e4bbfaaSStefano Zampini                tempGPU->begin());
1429aa372e3fSPaul Mullowney 
1430aa372e3fSPaul Mullowney   /* Next, solve L */
1431aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1432afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
1433afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1434afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1435afb2bd1cSJunchao Zhang                       #endif
1436afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1437aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1438aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1439aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1440aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1441afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1442afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1443afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1444afb2bd1cSJunchao Zhang                       #endif
1445afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1446aa372e3fSPaul Mullowney 
1447aa372e3fSPaul Mullowney   /* Then, solve U */
1448aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1449afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
1450afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1451afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1452afb2bd1cSJunchao Zhang                       #endif
1453afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1454aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1455aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1456aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1457aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1458afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
1459afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1460afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1461afb2bd1cSJunchao Zhang                       #endif
1462afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1463aa372e3fSPaul Mullowney 
14644e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
14654e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
14664e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
14674e4bbfaaSStefano Zampini                xGPU);
14689ae82921SPaul Mullowney 
1469c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1470c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
147105035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1472661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1473958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
14749ae82921SPaul Mullowney   PetscFunctionReturn(0);
14759ae82921SPaul Mullowney }
14769ae82921SPaul Mullowney 
14776fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
14789ae82921SPaul Mullowney {
1479465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1480465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
14819ae82921SPaul Mullowney   cusparseStatus_t                  stat;
14829ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1483aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1484aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1485aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1486b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
148757d48284SJunchao Zhang   cudaError_t                       cerr;
14889ae82921SPaul Mullowney 
14899ae82921SPaul Mullowney   PetscFunctionBegin;
1490e057df02SPaul Mullowney   /* Get the GPU pointers */
1491c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1492c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
14939ae82921SPaul Mullowney 
14947a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1495aa372e3fSPaul Mullowney   /* First, solve L */
1496aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1497afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
1498afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1499afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1500afb2bd1cSJunchao Zhang                       #endif
1501afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1502aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1503aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1504aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1505aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1506afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
1507afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1508afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1509afb2bd1cSJunchao Zhang                       #endif
1510afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1511aa372e3fSPaul Mullowney 
1512aa372e3fSPaul Mullowney   /* Next, solve U */
1513aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1514afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
1515afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1516afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1517afb2bd1cSJunchao Zhang                       #endif
1518afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1519aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1520aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1521aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1522aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1523afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
1524afb2bd1cSJunchao Zhang                       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1525afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1526afb2bd1cSJunchao Zhang                       #endif
1527afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
15289ae82921SPaul Mullowney 
1529c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1530c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
153105035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1532661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1533958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
15349ae82921SPaul Mullowney   PetscFunctionReturn(0);
15359ae82921SPaul Mullowney }
15369ae82921SPaul Mullowney 
1537*7e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
1538*7e8381f9SStefano Zampini {
1539*7e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
1540*7e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1541*7e8381f9SStefano Zampini   cudaError_t        cerr;
1542*7e8381f9SStefano Zampini   PetscErrorCode     ierr;
1543*7e8381f9SStefano Zampini 
1544*7e8381f9SStefano Zampini   PetscFunctionBegin;
1545*7e8381f9SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
1546*7e8381f9SStefano Zampini     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
1547*7e8381f9SStefano Zampini 
1548*7e8381f9SStefano Zampini     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1549*7e8381f9SStefano Zampini     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1550*7e8381f9SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
1551*7e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
1552*7e8381f9SStefano Zampini     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
1553*7e8381f9SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
1554*7e8381f9SStefano Zampini   }
1555*7e8381f9SStefano Zampini   PetscFunctionReturn(0);
1556*7e8381f9SStefano Zampini }
1557*7e8381f9SStefano Zampini 
1558*7e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
1559*7e8381f9SStefano Zampini {
1560*7e8381f9SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
1561*7e8381f9SStefano Zampini   PetscErrorCode ierr;
1562*7e8381f9SStefano Zampini 
1563*7e8381f9SStefano Zampini   PetscFunctionBegin;
1564*7e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
1565*7e8381f9SStefano Zampini   *array = a->a;
1566*7e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
1567*7e8381f9SStefano Zampini   PetscFunctionReturn(0);
1568*7e8381f9SStefano Zampini }
1569*7e8381f9SStefano Zampini 
15706fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
15719ae82921SPaul Mullowney {
1572aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
15737c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
15749ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1575213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
15769ae82921SPaul Mullowney   PetscErrorCode               ierr;
1577aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1578b06137fdSPaul Mullowney   cudaError_t                  err;
15799ae82921SPaul Mullowney 
15809ae82921SPaul Mullowney   PetscFunctionBegin;
158195639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1582c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
158381902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
158481902715SJunchao Zhang       /* Copy values only */
1585afb2bd1cSJunchao Zhang       CsrMatrix *matrix,*matrixT;
1586afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
158785ba7357SStefano Zampini 
158885ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1589afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
159005035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
15914863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
159285ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
159381902715SJunchao Zhang 
159481902715SJunchao Zhang       /* Update matT when it was built before */
159581902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
159681902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1597afb2bd1cSJunchao Zhang         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
159885ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
159981902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1600afb2bd1cSJunchao Zhang                             A->cmap->n, matrix->num_entries,
1601afb2bd1cSJunchao Zhang                             matrix->values->data().get(),
160281902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1603afb2bd1cSJunchao Zhang                             matrix->column_indices->data().get(),
1604afb2bd1cSJunchao Zhang                             matrixT->values->data().get(),
1605afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1606afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1607afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1608afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1609afb2bd1cSJunchao Zhang                           #else
1610afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1611afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1612afb2bd1cSJunchao Zhang                           #endif
1613afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
161405035670SJunchao Zhang         err  = WaitForCUDA();CHKERRCUDA(err);
161585ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
161681902715SJunchao Zhang       }
161734d6c7a5SJose E. Roman     } else {
161885ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
16197c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
16207c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
16217c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
162281902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
16239ae82921SPaul Mullowney       try {
16249ae82921SPaul Mullowney         if (a->compressedrow.use) {
16259ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
16269ae82921SPaul Mullowney           ii   = a->compressedrow.i;
16279ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
16289ae82921SPaul Mullowney         } else {
1629213423ffSJunchao Zhang           m    = A->rmap->n;
1630213423ffSJunchao Zhang           ii   = a->i;
1631e6e9a74fSStefano Zampini           ridx = NULL;
16329ae82921SPaul Mullowney         }
1633213423ffSJunchao Zhang         cusparsestruct->nrows = m;
16349ae82921SPaul Mullowney 
163585ba7357SStefano Zampini         /* create cusparse matrix */
1636aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
163757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
163857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
163957d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
16409ae82921SPaul Mullowney 
1641afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
16427656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
16437656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1644afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
16457656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
16467656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
164757d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1648b06137fdSPaul Mullowney 
1649aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1650aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1651aa372e3fSPaul Mullowney           /* set the matrix */
1652afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1653afb2bd1cSJunchao Zhang           mat->num_rows = m;
1654afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1655afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1656afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1657afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
16589ae82921SPaul Mullowney 
1659afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1660afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1661aa372e3fSPaul Mullowney 
1662afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1663afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1664aa372e3fSPaul Mullowney 
1665aa372e3fSPaul Mullowney           /* assign the pointer */
1666afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1667afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1668afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1669afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1670afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1671afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1672afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1673afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1674afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1675afb2bd1cSJunchao Zhang           }
1676afb2bd1cSJunchao Zhang          #endif
1677aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1678afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1679afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1680afb2bd1cSJunchao Zhang          #else
1681afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1682afb2bd1cSJunchao Zhang           mat->num_rows = m;
1683afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1684afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1685afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1686afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1687aa372e3fSPaul Mullowney 
1688afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1689afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1690aa372e3fSPaul Mullowney 
1691afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1692afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1693aa372e3fSPaul Mullowney 
1694aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
169557d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1696aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1697aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1698afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1699afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1700afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1701afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
170257d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1703aa372e3fSPaul Mullowney           /* assign the pointer */
1704aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1705aa372e3fSPaul Mullowney 
1706afb2bd1cSJunchao Zhang           if (mat) {
1707afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1708afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1709afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1710afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1711087f3262SPaul Mullowney           }
1712afb2bd1cSJunchao Zhang          #endif
1713087f3262SPaul Mullowney         }
1714ca45077fSPaul Mullowney 
1715aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1716213423ffSJunchao Zhang         if (a->compressedrow.use) {
1717213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1718aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1719aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1720213423ffSJunchao Zhang           tmp = m;
1721213423ffSJunchao Zhang         } else {
1722213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1723213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1724213423ffSJunchao Zhang           tmp = 0;
1725213423ffSJunchao Zhang         }
1726213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1727aa372e3fSPaul Mullowney 
1728aa372e3fSPaul Mullowney         /* assign the pointer */
1729aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
17309ae82921SPaul Mullowney       } catch(char *ex) {
17319ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
17329ae82921SPaul Mullowney       }
173305035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
173485ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
173534d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
173634d6c7a5SJose E. Roman     }
1737c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
17389ae82921SPaul Mullowney   }
17399ae82921SPaul Mullowney   PetscFunctionReturn(0);
17409ae82921SPaul Mullowney }
17419ae82921SPaul Mullowney 
1742c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1743aa372e3fSPaul Mullowney {
1744aa372e3fSPaul Mullowney   template <typename Tuple>
1745aa372e3fSPaul Mullowney   __host__ __device__
1746aa372e3fSPaul Mullowney   void operator()(Tuple t)
1747aa372e3fSPaul Mullowney   {
1748aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1749aa372e3fSPaul Mullowney   }
1750aa372e3fSPaul Mullowney };
1751aa372e3fSPaul Mullowney 
1752*7e8381f9SStefano Zampini struct VecCUDAEquals
1753*7e8381f9SStefano Zampini {
1754*7e8381f9SStefano Zampini   template <typename Tuple>
1755*7e8381f9SStefano Zampini   __host__ __device__
1756*7e8381f9SStefano Zampini   void operator()(Tuple t)
1757*7e8381f9SStefano Zampini   {
1758*7e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
1759*7e8381f9SStefano Zampini   }
1760*7e8381f9SStefano Zampini };
1761*7e8381f9SStefano Zampini 
1762e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1763e6e9a74fSStefano Zampini {
1764e6e9a74fSStefano Zampini   template <typename Tuple>
1765e6e9a74fSStefano Zampini   __host__ __device__
1766e6e9a74fSStefano Zampini   void operator()(Tuple t)
1767e6e9a74fSStefano Zampini   {
1768e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1769e6e9a74fSStefano Zampini   }
1770e6e9a74fSStefano Zampini };
1771e6e9a74fSStefano Zampini 
1772afb2bd1cSJunchao Zhang struct MatMatCusparse {
1773ccdfe979SStefano Zampini   PetscBool            cisdense;
1774ccdfe979SStefano Zampini   PetscScalar          *Bt;
1775ccdfe979SStefano Zampini   Mat                  X;
1776afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1777afb2bd1cSJunchao Zhang   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1778afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
1779afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
1780afb2bd1cSJunchao Zhang   size_t               spmmBufferSize;
1781afb2bd1cSJunchao Zhang   void                 *spmmBuffer;
1782afb2bd1cSJunchao Zhang   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1783afb2bd1cSJunchao Zhang #endif
1784afb2bd1cSJunchao Zhang };
1785ccdfe979SStefano Zampini 
1786ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1787ccdfe979SStefano Zampini {
1788ccdfe979SStefano Zampini   PetscErrorCode ierr;
1789ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1790ccdfe979SStefano Zampini   cudaError_t    cerr;
1791ccdfe979SStefano Zampini 
1792ccdfe979SStefano Zampini   PetscFunctionBegin;
1793ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1794afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1795afb2bd1cSJunchao Zhang   cusparseStatus_t stat;
1796afb2bd1cSJunchao Zhang   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1797afb2bd1cSJunchao Zhang   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1798afb2bd1cSJunchao Zhang   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1799afb2bd1cSJunchao Zhang  #endif
1800ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1801ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1802ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1803ccdfe979SStefano Zampini }
1804ccdfe979SStefano Zampini 
1805ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1806ccdfe979SStefano Zampini 
1807ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1808ccdfe979SStefano Zampini {
1809ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1810ccdfe979SStefano Zampini   Mat                          A,B;
1811afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1812ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1813ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1814ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1815ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1816ccdfe979SStefano Zampini   const PetscScalar            *barray;
1817ccdfe979SStefano Zampini   PetscScalar                  *carray;
1818ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1819ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1820ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1821ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1822afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1823ccdfe979SStefano Zampini 
1824ccdfe979SStefano Zampini   PetscFunctionBegin;
1825ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1826ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1827ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1828ccdfe979SStefano Zampini   A    = product->A;
1829ccdfe979SStefano Zampini   B    = product->B;
1830ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1831ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1832ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1833ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1834ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1835ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1836ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1837ccdfe979SStefano Zampini   switch (product->type) {
1838ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1839ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1840ccdfe979SStefano Zampini     mat = cusp->mat;
1841ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1842ccdfe979SStefano Zampini     m   = A->rmap->n;
1843ccdfe979SStefano Zampini     n   = B->cmap->n;
1844ccdfe979SStefano Zampini     break;
1845ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1846e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1847e6e9a74fSStefano Zampini       mat = cusp->mat;
1848e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1849e6e9a74fSStefano Zampini     } else {
1850ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1851ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1852ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1853e6e9a74fSStefano Zampini     }
1854ccdfe979SStefano Zampini     m = A->cmap->n;
1855ccdfe979SStefano Zampini     n = B->cmap->n;
1856ccdfe979SStefano Zampini     break;
1857ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1858ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1859ccdfe979SStefano Zampini     mat = cusp->mat;
1860ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1861ccdfe979SStefano Zampini     m   = A->rmap->n;
1862ccdfe979SStefano Zampini     n   = B->rmap->n;
1863ccdfe979SStefano Zampini     break;
1864ccdfe979SStefano Zampini   default:
1865ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1866ccdfe979SStefano Zampini   }
1867ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1868ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1869ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1870ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1871afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
1872ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1873afb2bd1cSJunchao Zhang 
1874ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1875c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1876c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1877c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1878c8378d12SStefano Zampini   } else {
1879c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1880c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1881c8378d12SStefano Zampini   }
1882c8378d12SStefano Zampini 
1883c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1884afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1885afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1886afb2bd1cSJunchao Zhang   /* (re)allcoate spmmBuffer if not initialized or LDAs are different */
1887afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1888afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1889afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
1890afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1891afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
1892afb2bd1cSJunchao Zhang     }
1893c8378d12SStefano Zampini 
1894afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1895afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1896afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1897afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
1898afb2bd1cSJunchao Zhang     }
1899afb2bd1cSJunchao Zhang 
1900afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
1901afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
1902afb2bd1cSJunchao Zhang                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1903afb2bd1cSJunchao Zhang                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
1904afb2bd1cSJunchao Zhang                               csrmat->values->data().get(),
1905afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1906afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1907afb2bd1cSJunchao Zhang     }
1908afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
1909afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1910afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
1911afb2bd1cSJunchao Zhang                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
1912afb2bd1cSJunchao Zhang     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1913afb2bd1cSJunchao Zhang     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
1914afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
1915afb2bd1cSJunchao Zhang   } else {
1916afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
1917afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
1918afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
1919afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
1920afb2bd1cSJunchao Zhang   }
1921afb2bd1cSJunchao Zhang 
1922afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
1923afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
1924afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
1925afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
1926afb2bd1cSJunchao Zhang                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
1927afb2bd1cSJunchao Zhang  #else
1928afb2bd1cSJunchao Zhang   PetscInt k;
1929afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
1930ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
1931ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
1932ccdfe979SStefano Zampini     cublasStatus_t cerr;
1933ccdfe979SStefano Zampini 
1934ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
1935ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
1936ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
1937ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
1938ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
1939ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
1940ccdfe979SStefano Zampini     blda = B->cmap->n;
1941afb2bd1cSJunchao Zhang     k    = B->cmap->n;
1942afb2bd1cSJunchao Zhang   } else {
1943afb2bd1cSJunchao Zhang     k    = B->rmap->n;
1944ccdfe979SStefano Zampini   }
1945ccdfe979SStefano Zampini 
1946afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
1947ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
1948afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
1949ccdfe979SStefano Zampini                            csrmat->values->data().get(),
1950ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
1951ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
1952ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
1953ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
1954afb2bd1cSJunchao Zhang  #endif
1955afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1956c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1957c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
1958ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
1959ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
1960ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1961ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
1962ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
1963ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1964ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
1965ccdfe979SStefano Zampini   } else {
1966ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
1967ccdfe979SStefano Zampini   }
1968ccdfe979SStefano Zampini   if (mmdata->cisdense) {
1969ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
1970ccdfe979SStefano Zampini   }
1971ccdfe979SStefano Zampini   if (!biscuda) {
1972ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
1973ccdfe979SStefano Zampini   }
1974ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1975ccdfe979SStefano Zampini }
1976ccdfe979SStefano Zampini 
1977ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1978ccdfe979SStefano Zampini {
1979ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
1980ccdfe979SStefano Zampini   Mat                A,B;
1981ccdfe979SStefano Zampini   PetscInt           m,n;
1982ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
1983ccdfe979SStefano Zampini   PetscErrorCode     ierr;
1984ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
1985ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
1986ccdfe979SStefano Zampini 
1987ccdfe979SStefano Zampini   PetscFunctionBegin;
1988ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1989ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
1990ccdfe979SStefano Zampini   A    = product->A;
1991ccdfe979SStefano Zampini   B    = product->B;
1992ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1993ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1994ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1995ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
1996ccdfe979SStefano Zampini   switch (product->type) {
1997ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1998ccdfe979SStefano Zampini     m = A->rmap->n;
1999ccdfe979SStefano Zampini     n = B->cmap->n;
2000ccdfe979SStefano Zampini     break;
2001ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2002ccdfe979SStefano Zampini     m = A->cmap->n;
2003ccdfe979SStefano Zampini     n = B->cmap->n;
2004ccdfe979SStefano Zampini     break;
2005ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2006ccdfe979SStefano Zampini     m = A->rmap->n;
2007ccdfe979SStefano Zampini     n = B->rmap->n;
2008ccdfe979SStefano Zampini     break;
2009ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2010ccdfe979SStefano Zampini     m = B->cmap->n;
2011ccdfe979SStefano Zampini     n = B->cmap->n;
2012ccdfe979SStefano Zampini     break;
2013ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2014ccdfe979SStefano Zampini     m = B->rmap->n;
2015ccdfe979SStefano Zampini     n = B->rmap->n;
2016ccdfe979SStefano Zampini     break;
2017ccdfe979SStefano Zampini   default:
2018ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2019ccdfe979SStefano Zampini   }
2020ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2021ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2022ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2023ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2024ccdfe979SStefano Zampini 
2025ccdfe979SStefano Zampini   /* product data */
2026ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2027ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2028afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2029afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2030ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2031afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2032ccdfe979SStefano Zampini   }
2033afb2bd1cSJunchao Zhang  #endif
2034ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2035ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2036ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2037ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2038ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2039ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2040ccdfe979SStefano Zampini     } else {
2041ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2042ccdfe979SStefano Zampini     }
2043ccdfe979SStefano Zampini   }
2044ccdfe979SStefano Zampini   C->product->data    = mmdata;
2045ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2046ccdfe979SStefano Zampini 
2047ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2048ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2049ccdfe979SStefano Zampini }
2050ccdfe979SStefano Zampini 
2051ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2052ccdfe979SStefano Zampini 
2053ccdfe979SStefano Zampini /* handles dense B */
2054ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2055ccdfe979SStefano Zampini {
2056ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
2057ccdfe979SStefano Zampini   PetscErrorCode ierr;
2058ccdfe979SStefano Zampini 
2059ccdfe979SStefano Zampini   PetscFunctionBegin;
2060ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2061ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2062ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
2063ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2064ccdfe979SStefano Zampini     PetscFunctionReturn(0);
2065ccdfe979SStefano Zampini   }
2066ccdfe979SStefano Zampini   switch (product->type) {
2067ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2068ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2069ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2070ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2071ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2072ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2073ccdfe979SStefano Zampini   default:
2074ccdfe979SStefano Zampini     break;
2075ccdfe979SStefano Zampini   }
2076ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2077ccdfe979SStefano Zampini }
2078ccdfe979SStefano Zampini 
20796fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
20809ae82921SPaul Mullowney {
2081b175d8bbSPaul Mullowney   PetscErrorCode ierr;
20829ae82921SPaul Mullowney 
20839ae82921SPaul Mullowney   PetscFunctionBegin;
2084e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2085e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2086e6e9a74fSStefano Zampini }
2087e6e9a74fSStefano Zampini 
2088e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2089e6e9a74fSStefano Zampini {
2090e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2091e6e9a74fSStefano Zampini 
2092e6e9a74fSStefano Zampini   PetscFunctionBegin;
2093e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2094e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2095e6e9a74fSStefano Zampini }
2096e6e9a74fSStefano Zampini 
2097e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2098e6e9a74fSStefano Zampini {
2099e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2100e6e9a74fSStefano Zampini 
2101e6e9a74fSStefano Zampini   PetscFunctionBegin;
2102e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2103e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2104e6e9a74fSStefano Zampini }
2105e6e9a74fSStefano Zampini 
2106e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2107e6e9a74fSStefano Zampini {
2108e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2109e6e9a74fSStefano Zampini 
2110e6e9a74fSStefano Zampini   PetscFunctionBegin;
2111e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
21129ae82921SPaul Mullowney   PetscFunctionReturn(0);
21139ae82921SPaul Mullowney }
21149ae82921SPaul Mullowney 
21156fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2116ca45077fSPaul Mullowney {
2117b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2118ca45077fSPaul Mullowney 
2119ca45077fSPaul Mullowney   PetscFunctionBegin;
2120e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2121ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2122ca45077fSPaul Mullowney }
2123ca45077fSPaul Mullowney 
2124afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2125e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
21269ae82921SPaul Mullowney {
21279ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2128aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
21299ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2130e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2131b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
213257d48284SJunchao Zhang   cudaError_t                  cerr;
2133aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2134e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2135e6e9a74fSStefano Zampini   PetscBool                    compressed;
2136afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2137afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2138afb2bd1cSJunchao Zhang #endif
21396e111a19SKarl Rupp 
21409ae82921SPaul Mullowney   PetscFunctionBegin;
2141e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2142e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2143afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2144e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2145e6e9a74fSStefano Zampini   }
214634d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
214734d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2148e6e9a74fSStefano Zampini   if (!trans) {
21499ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2150c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2151e6e9a74fSStefano Zampini   } else {
2152e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2153e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2154e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2155e6e9a74fSStefano Zampini     } else {
2156afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2157e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2158e6e9a74fSStefano Zampini     }
2159e6e9a74fSStefano Zampini   }
2160e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2161e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2162213423ffSJunchao Zhang 
2163e6e9a74fSStefano Zampini   try {
2164e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2165213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2166213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2167afb2bd1cSJunchao Zhang 
216885ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2169e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2170afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2171afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2172afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2173afb2bd1cSJunchao Zhang       */
2174e6e9a74fSStefano Zampini       xptr = xarray;
2175afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2176213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2177afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2178afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2179afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2180afb2bd1cSJunchao Zhang        */
2181afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2182afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2183afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2184afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2185afb2bd1cSJunchao Zhang       }
2186afb2bd1cSJunchao Zhang      #endif
2187e6e9a74fSStefano Zampini     } else {
2188afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2189afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2190afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2191afb2bd1cSJunchao Zhang        */
2192afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2193e6e9a74fSStefano Zampini       dptr = zarray;
2194e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2195afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2196e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2197e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2198e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2199e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2200e6e9a74fSStefano Zampini       }
2201afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2202afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2203afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2204afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2205afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2206afb2bd1cSJunchao Zhang       }
2207afb2bd1cSJunchao Zhang      #endif
2208e6e9a74fSStefano Zampini     }
22099ae82921SPaul Mullowney 
2210afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2211aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2212afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2213afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2214afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2215afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2216afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2217afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2218afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2219afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2220afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2221afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2222afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2223afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2224afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2225afb2bd1cSJunchao Zhang 
2226afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2227afb2bd1cSJunchao Zhang       } else {
2228afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2229afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2230afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2231afb2bd1cSJunchao Zhang       }
2232afb2bd1cSJunchao Zhang 
2233afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2234afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2235afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2236afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2237afb2bd1cSJunchao Zhang                                beta,
2238afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2239afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2240afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2241afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2242afb2bd1cSJunchao Zhang      #else
22437656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2244e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2245a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2246afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2247aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2248e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
224957d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2250afb2bd1cSJunchao Zhang      #endif
2251aa372e3fSPaul Mullowney     } else {
2252213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2253afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2254afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2255afb2bd1cSJunchao Zhang        #else
2256301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2257e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2258afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2259e6e9a74fSStefano Zampini                                  xptr, beta,
226057d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2261afb2bd1cSJunchao Zhang        #endif
2262a65300a6SPaul Mullowney       }
2263aa372e3fSPaul Mullowney     }
226405035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2265958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2266aa372e3fSPaul Mullowney 
2267e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2268213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2269213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2270213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2271e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2272213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
22737656d835SStefano Zampini         }
2274213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2275c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
22767656d835SStefano Zampini       }
22777656d835SStefano Zampini 
2278213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2279213423ffSJunchao Zhang       if (compressed) {
2280213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2281e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2282c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2283e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2284c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
228505035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2286958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2287e6e9a74fSStefano Zampini       }
2288e6e9a74fSStefano Zampini     } else {
2289e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2290e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2291e6e9a74fSStefano Zampini       }
2292e6e9a74fSStefano Zampini     }
2293e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2294213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2295213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
22969ae82921SPaul Mullowney   } catch(char *ex) {
22979ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
22989ae82921SPaul Mullowney   }
2299e6e9a74fSStefano Zampini   if (yy) {
2300958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2301e6e9a74fSStefano Zampini   } else {
2302e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2303e6e9a74fSStefano Zampini   }
23049ae82921SPaul Mullowney   PetscFunctionReturn(0);
23059ae82921SPaul Mullowney }
23069ae82921SPaul Mullowney 
23076fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2308ca45077fSPaul Mullowney {
2309b175d8bbSPaul Mullowney   PetscErrorCode ierr;
23106e111a19SKarl Rupp 
2311ca45077fSPaul Mullowney   PetscFunctionBegin;
2312e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2313ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2314ca45077fSPaul Mullowney }
2315ca45077fSPaul Mullowney 
23166fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
23179ae82921SPaul Mullowney {
23189ae82921SPaul Mullowney   PetscErrorCode              ierr;
23193fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
23203fa6b06aSMark Adams   PetscBool                   is_seq = PETSC_TRUE;
23213fa6b06aSMark Adams   PetscInt                    nnz_state = A->nonzerostate;
23229ae82921SPaul Mullowney   PetscFunctionBegin;
2323bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
23243fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2325bc3f50f2SPaul Mullowney   }
23263fa6b06aSMark Adams   if (d_mat) {
23273fa6b06aSMark Adams     cudaError_t err;
23283fa6b06aSMark Adams     ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
23293fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
23303fa6b06aSMark Adams     nnz_state = h_mat.nonzerostate;
23313fa6b06aSMark Adams     is_seq = h_mat.seq;
23323fa6b06aSMark Adams   }
23333fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
23343fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
23353fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU
23363fa6b06aSMark Adams     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
23373fa6b06aSMark Adams   } else if (nnz_state > A->nonzerostate) {
23383fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
23393fa6b06aSMark Adams   }
23403fa6b06aSMark Adams 
23419ae82921SPaul Mullowney   PetscFunctionReturn(0);
23429ae82921SPaul Mullowney }
23439ae82921SPaul Mullowney 
23449ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2345e057df02SPaul Mullowney /*@
23469ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2347e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
2348e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2349e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2350e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2351e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
23529ae82921SPaul Mullowney 
2353d083f849SBarry Smith    Collective
23549ae82921SPaul Mullowney 
23559ae82921SPaul Mullowney    Input Parameters:
23569ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
23579ae82921SPaul Mullowney .  m - number of rows
23589ae82921SPaul Mullowney .  n - number of columns
23599ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
23609ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
23610298fd71SBarry Smith          (possibly different for each row) or NULL
23629ae82921SPaul Mullowney 
23639ae82921SPaul Mullowney    Output Parameter:
23649ae82921SPaul Mullowney .  A - the matrix
23659ae82921SPaul Mullowney 
23669ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
23679ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
23689ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
23699ae82921SPaul Mullowney 
23709ae82921SPaul Mullowney    Notes:
23719ae82921SPaul Mullowney    If nnz is given then nz is ignored
23729ae82921SPaul Mullowney 
23739ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
23749ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
23759ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
23769ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
23779ae82921SPaul Mullowney 
23789ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
23790298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
23809ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
23819ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
23829ae82921SPaul Mullowney 
23839ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
23849ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
23859ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
23869ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
23879ae82921SPaul Mullowney 
23889ae82921SPaul Mullowney    Level: intermediate
23899ae82921SPaul Mullowney 
2390e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
23919ae82921SPaul Mullowney @*/
23929ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
23939ae82921SPaul Mullowney {
23949ae82921SPaul Mullowney   PetscErrorCode ierr;
23959ae82921SPaul Mullowney 
23969ae82921SPaul Mullowney   PetscFunctionBegin;
23979ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
23989ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
23999ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
24009ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
24019ae82921SPaul Mullowney   PetscFunctionReturn(0);
24029ae82921SPaul Mullowney }
24039ae82921SPaul Mullowney 
24046fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
24059ae82921SPaul Mullowney {
24069ae82921SPaul Mullowney   PetscErrorCode              ierr;
24073fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL;
2408ab25e6cbSDominic Meiser 
24099ae82921SPaul Mullowney   PetscFunctionBegin;
24109ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
24113fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
24123fa6b06aSMark Adams     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2413470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
24149ae82921SPaul Mullowney   } else {
2415470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
2416aa372e3fSPaul Mullowney   }
24173fa6b06aSMark Adams   if (d_mat) {
24183fa6b06aSMark Adams     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
24193fa6b06aSMark Adams     cudaError_t                err;
24203fa6b06aSMark Adams     PetscSplitCSRDataStructure h_mat;
24213fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
24223fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
24233fa6b06aSMark Adams     if (h_mat.seq) {
24243fa6b06aSMark Adams       if (a->compressedrow.use) {
24253fa6b06aSMark Adams  	err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
24263fa6b06aSMark Adams       }
24273fa6b06aSMark Adams       err = cudaFree(d_mat);CHKERRCUDA(err);
24283fa6b06aSMark Adams     }
24293fa6b06aSMark Adams   }
2430ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
2431ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
2432ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
2433ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
2434*7e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
2435*7e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
24369ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
24379ae82921SPaul Mullowney   PetscFunctionReturn(0);
24389ae82921SPaul Mullowney }
24399ae82921SPaul Mullowney 
2440ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
244195639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
24429ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
24439ff858a8SKarl Rupp {
24449ff858a8SKarl Rupp   PetscErrorCode ierr;
24459ff858a8SKarl Rupp 
24469ff858a8SKarl Rupp   PetscFunctionBegin;
24479ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2448ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
24499ff858a8SKarl Rupp   PetscFunctionReturn(0);
24509ff858a8SKarl Rupp }
24519ff858a8SKarl Rupp 
245295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
245395639643SRichard Tran Mills {
2454c58ef05eSStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2455e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2456e6e9a74fSStefano Zampini 
245795639643SRichard Tran Mills   PetscFunctionBegin;
2458e6e9a74fSStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
245995639643SRichard Tran Mills   if (flg) {
2460*7e8381f9SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
2461*7e8381f9SStefano Zampini 
246295639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJ;
246395639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2464c34f1ff0SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2465c34f1ff0SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2466e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = NULL;
2467e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = NULL;
2468e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
2469e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
2470*7e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
2471*7e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
2472*7e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
247395639643SRichard Tran Mills   } else {
247495639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
247595639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
247695639643SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
247795639643SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2478e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2479e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2480e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
2481e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
2482*7e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
2483*7e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
2484*7e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
248595639643SRichard Tran Mills   }
248695639643SRichard Tran Mills   A->boundtocpu = flg;
2487c58ef05eSStefano Zampini   a->inode.use = flg;
248895639643SRichard Tran Mills   PetscFunctionReturn(0);
248995639643SRichard Tran Mills }
249095639643SRichard Tran Mills 
24913fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
24923fa6b06aSMark Adams {
24933fa6b06aSMark Adams   PetscSplitCSRDataStructure *d_mat = NULL;
24943fa6b06aSMark Adams   PetscErrorCode             ierr;
2495*7e8381f9SStefano Zampini   PetscBool                  both = PETSC_FALSE;
2496*7e8381f9SStefano Zampini 
24973fa6b06aSMark Adams   PetscFunctionBegin;
24983fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
24993fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
2500*7e8381f9SStefano Zampini     if (spptr->mat) {
2501*7e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
2502*7e8381f9SStefano Zampini       if (matrix->values) {
2503*7e8381f9SStefano Zampini         both = PETSC_TRUE;
2504*7e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
2505*7e8381f9SStefano Zampini       }
2506*7e8381f9SStefano Zampini     }
2507*7e8381f9SStefano Zampini     if (spptr->matTranspose) {
2508*7e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
2509*7e8381f9SStefano Zampini       if (matrix->values) {
2510*7e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
2511*7e8381f9SStefano Zampini       }
2512*7e8381f9SStefano Zampini     }
25133fa6b06aSMark Adams     d_mat = spptr->deviceMat;
25143fa6b06aSMark Adams   }
25153fa6b06aSMark Adams   if (d_mat) {
25163fa6b06aSMark Adams     Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
25173fa6b06aSMark Adams     PetscInt     n = A->rmap->n, nnz = a->i[n];
25183fa6b06aSMark Adams     cudaError_t  err;
25193fa6b06aSMark Adams     PetscScalar  *vals;
25203fa6b06aSMark Adams     ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
25213fa6b06aSMark Adams     err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
25223fa6b06aSMark Adams     err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
25233fa6b06aSMark Adams   }
25243fa6b06aSMark Adams   ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
2525*7e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
25263fa6b06aSMark Adams 
25273fa6b06aSMark Adams   PetscFunctionReturn(0);
25283fa6b06aSMark Adams }
25293fa6b06aSMark Adams 
253049735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
25319ae82921SPaul Mullowney {
25329ae82921SPaul Mullowney   PetscErrorCode   ierr;
2533aa372e3fSPaul Mullowney   cusparseStatus_t stat;
253449735bf3SStefano Zampini   Mat              B;
25359ae82921SPaul Mullowney 
25369ae82921SPaul Mullowney   PetscFunctionBegin;
253749735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
253849735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
253949735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
254049735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
254149735bf3SStefano Zampini   }
254249735bf3SStefano Zampini   B = *newmat;
254349735bf3SStefano Zampini 
254434136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
254534136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
254634136279SStefano Zampini 
254749735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
25489ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
2549e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
2550e6e9a74fSStefano Zampini 
2551e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2552e6e9a74fSStefano Zampini       spptr->format = MAT_CUSPARSE_CSR;
2553e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2554e6e9a74fSStefano Zampini       B->spptr = spptr;
25553fa6b06aSMark Adams       spptr->deviceMat = NULL;
25569ae82921SPaul Mullowney     } else {
2557e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
2558e6e9a74fSStefano Zampini 
2559e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2560e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2561e6e9a74fSStefano Zampini       B->spptr = spptr;
25629ae82921SPaul Mullowney     }
2563e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
256449735bf3SStefano Zampini   }
2565693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
25669ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
25679ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
256895639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2569693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
25703fa6b06aSMark Adams   B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;
25712205254eSKarl Rupp 
2572e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
25739ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2574bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
25759ae82921SPaul Mullowney   PetscFunctionReturn(0);
25769ae82921SPaul Mullowney }
25779ae82921SPaul Mullowney 
257802fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
257902fe1965SBarry Smith {
258002fe1965SBarry Smith   PetscErrorCode ierr;
258102fe1965SBarry Smith 
258202fe1965SBarry Smith   PetscFunctionBegin;
258305035670SJunchao Zhang   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
258402fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
25850ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2586afb2bd1cSJunchao Zhang   ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
2587afb2bd1cSJunchao Zhang   ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
2588afb2bd1cSJunchao Zhang   ierr = PetscOptionsEnd();CHKERRQ(ierr);
258902fe1965SBarry Smith   PetscFunctionReturn(0);
259002fe1965SBarry Smith }
259102fe1965SBarry Smith 
25923ca39a21SBarry Smith /*MC
2593e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2594e057df02SPaul Mullowney 
2595e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
25962692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
25972692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2598e057df02SPaul Mullowney 
2599e057df02SPaul Mullowney    Options Database Keys:
2600e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2601aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2602a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2603e057df02SPaul Mullowney 
2604e057df02SPaul Mullowney   Level: beginner
2605e057df02SPaul Mullowney 
26068468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2607e057df02SPaul Mullowney M*/
26087f756511SDominic Meiser 
260942c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
261042c9c57cSBarry Smith 
26110f39cd5aSBarry Smith 
26123ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
261342c9c57cSBarry Smith {
261442c9c57cSBarry Smith   PetscErrorCode ierr;
261542c9c57cSBarry Smith 
261642c9c57cSBarry Smith   PetscFunctionBegin;
26173ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
26183ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
26193ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
26203ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
262142c9c57cSBarry Smith   PetscFunctionReturn(0);
262242c9c57cSBarry Smith }
262329b38603SBarry Smith 
2624470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
26257f756511SDominic Meiser {
2626e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
26277f756511SDominic Meiser   cusparseStatus_t stat;
26287f756511SDominic Meiser 
26297f756511SDominic Meiser   PetscFunctionBegin;
26307f756511SDominic Meiser   if (*cusparsestruct) {
2631e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
2632e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
26337f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
263481902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
2635*7e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
2636*7e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
2637*7e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_v;
2638*7e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_w;
2639*7e8381f9SStefano Zampini     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
2640afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2641afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2642afb2bd1cSJunchao Zhang    #endif
2643e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
26447f756511SDominic Meiser   }
26457f756511SDominic Meiser   PetscFunctionReturn(0);
26467f756511SDominic Meiser }
26477f756511SDominic Meiser 
26487f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
26497f756511SDominic Meiser {
26507f756511SDominic Meiser   PetscFunctionBegin;
26517f756511SDominic Meiser   if (*mat) {
26527f756511SDominic Meiser     delete (*mat)->values;
26537f756511SDominic Meiser     delete (*mat)->column_indices;
26547f756511SDominic Meiser     delete (*mat)->row_offsets;
26557f756511SDominic Meiser     delete *mat;
26567f756511SDominic Meiser     *mat = 0;
26577f756511SDominic Meiser   }
26587f756511SDominic Meiser   PetscFunctionReturn(0);
26597f756511SDominic Meiser }
26607f756511SDominic Meiser 
2661470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
26627f756511SDominic Meiser {
26637f756511SDominic Meiser   cusparseStatus_t stat;
26647f756511SDominic Meiser   PetscErrorCode   ierr;
26657f756511SDominic Meiser 
26667f756511SDominic Meiser   PetscFunctionBegin;
26677f756511SDominic Meiser   if (*trifactor) {
266857d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2669afb2bd1cSJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
26707f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
2671afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2672afb2bd1cSJunchao Zhang     cudaError_t cerr;
2673afb2bd1cSJunchao Zhang     if ((*trifactor)->solveBuffer)   {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2674afb2bd1cSJunchao Zhang     if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2675afb2bd1cSJunchao Zhang    #endif
26767f756511SDominic Meiser     delete *trifactor;
2677*7e8381f9SStefano Zampini     *trifactor = NULL;
26787f756511SDominic Meiser   }
26797f756511SDominic Meiser   PetscFunctionReturn(0);
26807f756511SDominic Meiser }
26817f756511SDominic Meiser 
2682470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
26837f756511SDominic Meiser {
26847f756511SDominic Meiser   CsrMatrix        *mat;
26857f756511SDominic Meiser   cusparseStatus_t stat;
26867f756511SDominic Meiser   cudaError_t      err;
26877f756511SDominic Meiser 
26887f756511SDominic Meiser   PetscFunctionBegin;
26897f756511SDominic Meiser   if (*matstruct) {
26907f756511SDominic Meiser     if ((*matstruct)->mat) {
26917f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2692afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2693afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2694afb2bd1cSJunchao Zhang        #else
26957f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
269657d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2697afb2bd1cSJunchao Zhang        #endif
26987f756511SDominic Meiser       } else {
26997f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
27007f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
27017f756511SDominic Meiser       }
27027f756511SDominic Meiser     }
270357d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
27047f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
2705afb2bd1cSJunchao Zhang     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
27067656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
27077656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
2708afb2bd1cSJunchao Zhang 
2709afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2710afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2711afb2bd1cSJunchao Zhang     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2712afb2bd1cSJunchao Zhang     for (int i=0; i<3; i++) {
2713afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
2714afb2bd1cSJunchao Zhang         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2715afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2716afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2717afb2bd1cSJunchao Zhang       }
2718afb2bd1cSJunchao Zhang     }
2719afb2bd1cSJunchao Zhang    #endif
27207f756511SDominic Meiser     delete *matstruct;
2721*7e8381f9SStefano Zampini     *matstruct = NULL;
27227f756511SDominic Meiser   }
27237f756511SDominic Meiser   PetscFunctionReturn(0);
27247f756511SDominic Meiser }
27257f756511SDominic Meiser 
2726ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
27277f756511SDominic Meiser {
2728e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2729e6e9a74fSStefano Zampini 
27307f756511SDominic Meiser   PetscFunctionBegin;
27317f756511SDominic Meiser   if (*trifactors) {
2732e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
2733e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
2734e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
2735e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
27367f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
27377f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
27387f756511SDominic Meiser     delete (*trifactors)->workVector;
2739*7e8381f9SStefano Zampini     (*trifactors)->rpermIndices = NULL;
2740*7e8381f9SStefano Zampini     (*trifactors)->cpermIndices = NULL;
2741*7e8381f9SStefano Zampini     (*trifactors)->workVector = NULL;
2742ccdfe979SStefano Zampini   }
2743ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2744ccdfe979SStefano Zampini }
2745ccdfe979SStefano Zampini 
2746ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2747ccdfe979SStefano Zampini {
2748e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
2749ccdfe979SStefano Zampini   cusparseHandle_t handle;
2750ccdfe979SStefano Zampini   cusparseStatus_t stat;
2751ccdfe979SStefano Zampini 
2752ccdfe979SStefano Zampini   PetscFunctionBegin;
2753ccdfe979SStefano Zampini   if (*trifactors) {
2754e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
27557f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
275657d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
27577f756511SDominic Meiser     }
2758e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
27597f756511SDominic Meiser   }
27607f756511SDominic Meiser   PetscFunctionReturn(0);
27617f756511SDominic Meiser }
2762*7e8381f9SStefano Zampini 
2763*7e8381f9SStefano Zampini struct IJCompare
2764*7e8381f9SStefano Zampini {
2765*7e8381f9SStefano Zampini   __host__ __device__
2766*7e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
2767*7e8381f9SStefano Zampini   {
2768*7e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
2769*7e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
2770*7e8381f9SStefano Zampini     return false;
2771*7e8381f9SStefano Zampini   }
2772*7e8381f9SStefano Zampini };
2773*7e8381f9SStefano Zampini 
2774*7e8381f9SStefano Zampini struct IJEqual
2775*7e8381f9SStefano Zampini {
2776*7e8381f9SStefano Zampini   __host__ __device__
2777*7e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
2778*7e8381f9SStefano Zampini   {
2779*7e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
2780*7e8381f9SStefano Zampini     return true;
2781*7e8381f9SStefano Zampini   }
2782*7e8381f9SStefano Zampini };
2783*7e8381f9SStefano Zampini 
2784*7e8381f9SStefano Zampini struct IJDiff
2785*7e8381f9SStefano Zampini {
2786*7e8381f9SStefano Zampini   __host__ __device__
2787*7e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
2788*7e8381f9SStefano Zampini   {
2789*7e8381f9SStefano Zampini     return t1 == t2 ? 0 : 1;
2790*7e8381f9SStefano Zampini   }
2791*7e8381f9SStefano Zampini };
2792*7e8381f9SStefano Zampini 
2793*7e8381f9SStefano Zampini struct IJSum
2794*7e8381f9SStefano Zampini {
2795*7e8381f9SStefano Zampini   __host__ __device__
2796*7e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
2797*7e8381f9SStefano Zampini   {
2798*7e8381f9SStefano Zampini     return t1||t2;
2799*7e8381f9SStefano Zampini   }
2800*7e8381f9SStefano Zampini };
2801*7e8381f9SStefano Zampini 
2802*7e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
2803*7e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
2804*7e8381f9SStefano Zampini {
2805*7e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2806*7e8381f9SStefano Zampini   CsrMatrix          *matrix;
2807*7e8381f9SStefano Zampini   PetscErrorCode     ierr;
2808*7e8381f9SStefano Zampini   cudaError_t        cerr;
2809*7e8381f9SStefano Zampini   PetscInt           n;
2810*7e8381f9SStefano Zampini 
2811*7e8381f9SStefano Zampini   PetscFunctionBegin;
2812*7e8381f9SStefano Zampini   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
2813*7e8381f9SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
2814*7e8381f9SStefano Zampini   if (!cusp->cooPerm) {
2815*7e8381f9SStefano Zampini     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2816*7e8381f9SStefano Zampini     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2817*7e8381f9SStefano Zampini     PetscFunctionReturn(0);
2818*7e8381f9SStefano Zampini   }
2819*7e8381f9SStefano Zampini   n = cusp->cooPerm->size();
2820*7e8381f9SStefano Zampini   matrix = (CsrMatrix*)cusp->mat->mat;
2821*7e8381f9SStefano Zampini   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
2822*7e8381f9SStefano Zampini   if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
2823*7e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
2824*7e8381f9SStefano Zampini   if (v) {
2825*7e8381f9SStefano Zampini     cusp->cooPerm_v->assign(v,v+n);
2826*7e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
2827*7e8381f9SStefano Zampini   }
2828*7e8381f9SStefano Zampini   else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
2829*7e8381f9SStefano Zampini   if (imode == ADD_VALUES) {
2830*7e8381f9SStefano Zampini     if (cusp->cooPerm_a) {
2831*7e8381f9SStefano Zampini       if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
2832*7e8381f9SStefano Zampini       auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
2833*7e8381f9SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
2834*7e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
2835*7e8381f9SStefano Zampini     } else {
2836*7e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
2837*7e8381f9SStefano Zampini                                                                 matrix->values->begin()));
2838*7e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
2839*7e8381f9SStefano Zampini                                                                 matrix->values->end()));
2840*7e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
2841*7e8381f9SStefano Zampini     }
2842*7e8381f9SStefano Zampini   } else {
2843*7e8381f9SStefano Zampini     if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
2844*7e8381f9SStefano Zampini                               if we are inserting two different values into the same location */
2845*7e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
2846*7e8381f9SStefano Zampini                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
2847*7e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
2848*7e8381f9SStefano Zampini                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
2849*7e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
2850*7e8381f9SStefano Zampini     } else {
2851*7e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
2852*7e8381f9SStefano Zampini                                                                 matrix->values->begin()));
2853*7e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
2854*7e8381f9SStefano Zampini                                                                 matrix->values->end()));
2855*7e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
2856*7e8381f9SStefano Zampini     }
2857*7e8381f9SStefano Zampini   }
2858*7e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2859*7e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
2860*7e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
2861*7e8381f9SStefano Zampini   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2862*7e8381f9SStefano Zampini   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
2863*7e8381f9SStefano Zampini   /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
2864*7e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
2865*7e8381f9SStefano Zampini   PetscFunctionReturn(0);
2866*7e8381f9SStefano Zampini }
2867*7e8381f9SStefano Zampini 
2868*7e8381f9SStefano Zampini #include <thrust/binary_search.h>
2869*7e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
2870*7e8381f9SStefano Zampini {
2871*7e8381f9SStefano Zampini   PetscErrorCode     ierr;
2872*7e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2873*7e8381f9SStefano Zampini   CsrMatrix          *matrix;
2874*7e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
2875*7e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
2876*7e8381f9SStefano Zampini   cudaError_t        cerr;
2877*7e8381f9SStefano Zampini 
2878*7e8381f9SStefano Zampini   PetscFunctionBegin;
2879*7e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
2880*7e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
2881*7e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
2882*7e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
2883*7e8381f9SStefano Zampini   if (n != cooPerm_n) {
2884*7e8381f9SStefano Zampini     delete cusp->cooPerm;
2885*7e8381f9SStefano Zampini     delete cusp->cooPerm_v;
2886*7e8381f9SStefano Zampini     delete cusp->cooPerm_w;
2887*7e8381f9SStefano Zampini     delete cusp->cooPerm_a;
2888*7e8381f9SStefano Zampini     cusp->cooPerm = NULL;
2889*7e8381f9SStefano Zampini     cusp->cooPerm_v = NULL;
2890*7e8381f9SStefano Zampini     cusp->cooPerm_w = NULL;
2891*7e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
2892*7e8381f9SStefano Zampini   }
2893*7e8381f9SStefano Zampini   if (n) {
2894*7e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
2895*7e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
2896*7e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
2897*7e8381f9SStefano Zampini 
2898*7e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
2899*7e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
2900*7e8381f9SStefano Zampini 
2901*7e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
2902*7e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
2903*7e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
2904*7e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
2905*7e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
2906*7e8381f9SStefano Zampini 
2907*7e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
2908*7e8381f9SStefano Zampini     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
2909*7e8381f9SStefano Zampini     *cusp->cooPerm_a = d_i;
2910*7e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
2911*7e8381f9SStefano Zampini 
2912*7e8381f9SStefano Zampini     auto nekey = thrust::unique(fkey, ekey, IJEqual());
2913*7e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
2914*7e8381f9SStefano Zampini       delete cusp->cooPerm_a;
2915*7e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
2916*7e8381f9SStefano Zampini     } else { /* I couldn't come up with a more elegant algorithm */
2917*7e8381f9SStefano Zampini       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
2918*7e8381f9SStefano Zampini       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
2919*7e8381f9SStefano Zampini       (*cusp->cooPerm_a)[0] = 0;
2920*7e8381f9SStefano Zampini       w[0] = 0;
2921*7e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
2922*7e8381f9SStefano Zampini       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
2923*7e8381f9SStefano Zampini     }
2924*7e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
2925*7e8381f9SStefano Zampini     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
2926*7e8381f9SStefano Zampini                         search_begin, search_begin + A->rmap->n,
2927*7e8381f9SStefano Zampini                         ii.begin());
2928*7e8381f9SStefano Zampini 
2929*7e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
2930*7e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
2931*7e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
2932*7e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
2933*7e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
2934*7e8381f9SStefano Zampini     a->i[0] = 0;
2935*7e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2936*7e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
2937*7e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
2938*7e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
2939*7e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2940*7e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
2941*7e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
2942*7e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
2943*7e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
2944*7e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
2945*7e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
2946*7e8381f9SStefano Zampini     }
2947*7e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
2948*7e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
2949*7e8381f9SStefano Zampini   } else {
2950*7e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
2951*7e8381f9SStefano Zampini   }
2952*7e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2953*7e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
2954*7e8381f9SStefano Zampini 
2955*7e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
2956*7e8381f9SStefano Zampini      The code is so convoluted now that I prefer to copy garbage to the GPU */
2957*7e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
2958*7e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
2959*7e8381f9SStefano Zampini   A->nonzerostate++;
2960*7e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2961*7e8381f9SStefano Zampini   {
2962*7e8381f9SStefano Zampini     matrix = (CsrMatrix*)cusp->mat->mat;
2963*7e8381f9SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
2964*7e8381f9SStefano Zampini     thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
2965*7e8381f9SStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
2966*7e8381f9SStefano Zampini   }
2967*7e8381f9SStefano Zampini 
2968*7e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
2969*7e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
2970*7e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
2971*7e8381f9SStefano Zampini   PetscFunctionReturn(0);
2972*7e8381f9SStefano Zampini }
2973