xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision fcdce8c4deba716d0e3c51bd0759468eb68e474e)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
686fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
759ae82921SPaul Mullowney 
767f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
827f756511SDominic Meiser 
837e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
847e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
857e8381f9SStefano Zampini 
86b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
87b06137fdSPaul Mullowney {
88b06137fdSPaul Mullowney   cusparseStatus_t   stat;
89b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
90b06137fdSPaul Mullowney 
91b06137fdSPaul Mullowney   PetscFunctionBegin;
92d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
93b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
9457d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
95b06137fdSPaul Mullowney   PetscFunctionReturn(0);
96b06137fdSPaul Mullowney }
97b06137fdSPaul Mullowney 
98b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
99b06137fdSPaul Mullowney {
100b06137fdSPaul Mullowney   cusparseStatus_t   stat;
101b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
102b06137fdSPaul Mullowney 
103b06137fdSPaul Mullowney   PetscFunctionBegin;
104d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1056b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
10616a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
10757d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
10816a2e217SAlejandro Lamas Daviña     }
109b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1106b1cf21dSAlejandro Lamas Daviña   }
11157d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
112b06137fdSPaul Mullowney   PetscFunctionReturn(0);
113b06137fdSPaul Mullowney }
114b06137fdSPaul Mullowney 
115b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
116b06137fdSPaul Mullowney {
117b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1187e8381f9SStefano Zampini   PetscBool          flg;
1197e8381f9SStefano Zampini   PetscErrorCode     ierr;
120ccdfe979SStefano Zampini 
121b06137fdSPaul Mullowney   PetscFunctionBegin;
1227e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1237e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
124ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
125b06137fdSPaul Mullowney   PetscFunctionReturn(0);
126b06137fdSPaul Mullowney }
127b06137fdSPaul Mullowney 
128ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1299ae82921SPaul Mullowney {
1309ae82921SPaul Mullowney   PetscFunctionBegin;
1319ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1329ae82921SPaul Mullowney   PetscFunctionReturn(0);
1339ae82921SPaul Mullowney }
1349ae82921SPaul Mullowney 
135c708e6cdSJed Brown /*MC
136087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
137087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
138087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
139087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
140087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
141087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
142c708e6cdSJed Brown 
1439ae82921SPaul Mullowney   Level: beginner
144c708e6cdSJed Brown 
1453ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
146c708e6cdSJed Brown M*/
1479ae82921SPaul Mullowney 
14842c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1499ae82921SPaul Mullowney {
1509ae82921SPaul Mullowney   PetscErrorCode ierr;
151bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1529ae82921SPaul Mullowney 
1539ae82921SPaul Mullowney   PetscFunctionBegin;
154bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
155bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1562c7c0729SBarry Smith   (*B)->factortype = ftype;
1572c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1589ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1592205254eSKarl Rupp 
160087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16133d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1629ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1639ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
164087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
165087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
166087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1679ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
168bc3f50f2SPaul Mullowney 
169fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1703ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1719ae82921SPaul Mullowney   PetscFunctionReturn(0);
1729ae82921SPaul Mullowney }
1739ae82921SPaul Mullowney 
174bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
175ca45077fSPaul Mullowney {
176aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1776e111a19SKarl Rupp 
178ca45077fSPaul Mullowney   PetscFunctionBegin;
179ca45077fSPaul Mullowney   switch (op) {
180e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
181aa372e3fSPaul Mullowney     cusparsestruct->format = format;
182ca45077fSPaul Mullowney     break;
183e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
184aa372e3fSPaul Mullowney     cusparsestruct->format = format;
185ca45077fSPaul Mullowney     break;
186ca45077fSPaul Mullowney   default:
18736d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
188ca45077fSPaul Mullowney   }
189ca45077fSPaul Mullowney   PetscFunctionReturn(0);
190ca45077fSPaul Mullowney }
1919ae82921SPaul Mullowney 
192e057df02SPaul Mullowney /*@
193e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
194e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
195aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
196e057df02SPaul Mullowney    Not Collective
197e057df02SPaul Mullowney 
198e057df02SPaul Mullowney    Input Parameters:
1998468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
20036d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2012692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
202e057df02SPaul Mullowney 
203e057df02SPaul Mullowney    Output Parameter:
204e057df02SPaul Mullowney 
205e057df02SPaul Mullowney    Level: intermediate
206e057df02SPaul Mullowney 
2078468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
208e057df02SPaul Mullowney @*/
209e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
210e057df02SPaul Mullowney {
211e057df02SPaul Mullowney   PetscErrorCode ierr;
2126e111a19SKarl Rupp 
213e057df02SPaul Mullowney   PetscFunctionBegin;
214e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
215e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
216e057df02SPaul Mullowney   PetscFunctionReturn(0);
217e057df02SPaul Mullowney }
218e057df02SPaul Mullowney 
219e6e9a74fSStefano Zampini /*@
220e6e9a74fSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
221e6e9a74fSStefano Zampini 
222e6e9a74fSStefano Zampini    Collective on mat
223e6e9a74fSStefano Zampini 
224e6e9a74fSStefano Zampini    Input Parameters:
225e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
226e6e9a74fSStefano Zampini -  transgen - the boolean flag
227e6e9a74fSStefano Zampini 
228e6e9a74fSStefano Zampini    Level: intermediate
229e6e9a74fSStefano Zampini 
230e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE
231e6e9a74fSStefano Zampini @*/
232e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
233e6e9a74fSStefano Zampini {
234e6e9a74fSStefano Zampini   PetscErrorCode ierr;
235e6e9a74fSStefano Zampini   PetscBool      flg;
236e6e9a74fSStefano Zampini 
237e6e9a74fSStefano Zampini   PetscFunctionBegin;
238e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
239e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
240e6e9a74fSStefano Zampini   if (flg) {
241e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
24254da937aSStefano Zampini 
243e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
244e6e9a74fSStefano Zampini     cusp->transgen = transgen;
24554da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
24654da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
24754da937aSStefano Zampini     }
248e6e9a74fSStefano Zampini   }
249e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
250e6e9a74fSStefano Zampini }
251e6e9a74fSStefano Zampini 
2524416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2539ae82921SPaul Mullowney {
2549ae82921SPaul Mullowney   PetscErrorCode           ierr;
255e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2569ae82921SPaul Mullowney   PetscBool                flg;
257a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2586e111a19SKarl Rupp 
2599ae82921SPaul Mullowney   PetscFunctionBegin;
260e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2619ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
26254da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
26354da937aSStefano Zampini 
26454da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
265afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
266afb2bd1cSJunchao Zhang 
267e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
268a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
269afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
270afb2bd1cSJunchao Zhang 
2714c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
272a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
273afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
274afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
275afb2bd1cSJunchao Zhang     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
276afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
277afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
278afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
279afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
280afb2bd1cSJunchao Zhang 
281afb2bd1cSJunchao Zhang     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
282afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
283afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
284afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
285afb2bd1cSJunchao Zhang 
286afb2bd1cSJunchao Zhang     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
287afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
288afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
289afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
290afb2bd1cSJunchao Zhang    #endif
2914c87dfd4SPaul Mullowney   }
2920af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2939ae82921SPaul Mullowney   PetscFunctionReturn(0);
2949ae82921SPaul Mullowney }
2959ae82921SPaul Mullowney 
2966fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2979ae82921SPaul Mullowney {
298da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2999ae82921SPaul Mullowney   PetscErrorCode               ierr;
3009ae82921SPaul Mullowney 
3019ae82921SPaul Mullowney   PetscFunctionBegin;
302da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3039ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3049ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3059ae82921SPaul Mullowney   PetscFunctionReturn(0);
3069ae82921SPaul Mullowney }
3079ae82921SPaul Mullowney 
3086fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3099ae82921SPaul Mullowney {
310da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3119ae82921SPaul Mullowney   PetscErrorCode               ierr;
3129ae82921SPaul Mullowney 
3139ae82921SPaul Mullowney   PetscFunctionBegin;
314da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3159ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3169ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3179ae82921SPaul Mullowney   PetscFunctionReturn(0);
3189ae82921SPaul Mullowney }
3199ae82921SPaul Mullowney 
320087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
321087f3262SPaul Mullowney {
322da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
323087f3262SPaul Mullowney   PetscErrorCode               ierr;
324087f3262SPaul Mullowney 
325087f3262SPaul Mullowney   PetscFunctionBegin;
326da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
327087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
328087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
329087f3262SPaul Mullowney   PetscFunctionReturn(0);
330087f3262SPaul Mullowney }
331087f3262SPaul Mullowney 
332087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
333087f3262SPaul Mullowney {
334da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
335087f3262SPaul Mullowney   PetscErrorCode               ierr;
336087f3262SPaul Mullowney 
337087f3262SPaul Mullowney   PetscFunctionBegin;
338da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
339087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
340087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
341087f3262SPaul Mullowney   PetscFunctionReturn(0);
342087f3262SPaul Mullowney }
343087f3262SPaul Mullowney 
344087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3459ae82921SPaul Mullowney {
3469ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3479ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3489ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
349aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3509ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3519ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3529ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3539ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3549ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
355b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
35657d48284SJunchao Zhang   cudaError_t                       cerr;
3579ae82921SPaul Mullowney 
3589ae82921SPaul Mullowney   PetscFunctionBegin;
359cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
360c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3619ae82921SPaul Mullowney     try {
3629ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3639ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
364da79fbbcSStefano Zampini       if (!loTriFactor) {
3652cbc15d9SMark         PetscScalar                       *AALo;
3662cbc15d9SMark 
3672cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3689ae82921SPaul Mullowney 
3699ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
37057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3729ae82921SPaul Mullowney 
3739ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3749ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3759ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3769ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3779ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3789ae82921SPaul Mullowney         v        = aa;
3799ae82921SPaul Mullowney         vi       = aj;
3809ae82921SPaul Mullowney         offset   = 1;
3819ae82921SPaul Mullowney         rowOffset= 1;
3829ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3839ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
384e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3859ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3869ae82921SPaul Mullowney           rowOffset += nz+1;
3879ae82921SPaul Mullowney 
388580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
389580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3909ae82921SPaul Mullowney 
3919ae82921SPaul Mullowney           offset      += nz;
3929ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3939ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3949ae82921SPaul Mullowney           offset      += 1;
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney           v  += nz;
3979ae82921SPaul Mullowney           vi += nz;
3989ae82921SPaul Mullowney         }
3992205254eSKarl Rupp 
400aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
401da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
402da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
403aa372e3fSPaul Mullowney         /* Create the matrix description */
40457d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
40557d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4061b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
407afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
408afb2bd1cSJunchao Zhang        #else
40957d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
410afb2bd1cSJunchao Zhang        #endif
41157d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41257d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
413aa372e3fSPaul Mullowney 
414aa372e3fSPaul Mullowney         /* set the operation */
415aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
416aa372e3fSPaul Mullowney 
417aa372e3fSPaul Mullowney         /* set the matrix */
418aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
419aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
420aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
421aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
422aa372e3fSPaul Mullowney 
423aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
424aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
425aa372e3fSPaul Mullowney 
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
427aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
428aa372e3fSPaul Mullowney 
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
430aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
431aa372e3fSPaul Mullowney 
432afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
433da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
434afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4351b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
436afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
437afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
438afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
439afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
440afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
441afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
442afb2bd1cSJunchao Zhang       #endif
443afb2bd1cSJunchao Zhang 
444aa372e3fSPaul Mullowney         /* perform the solve analysis */
445aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
446aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
447aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
448afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4491b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
450afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
451afb2bd1cSJunchao Zhang                                #endif
452afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
453da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
454da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
455aa372e3fSPaul Mullowney 
456da79fbbcSStefano Zampini         /* assign the pointer */
457aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4582cbc15d9SMark         loTriFactor->AA_h = AALo;
45957d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
46057d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4614863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
462da79fbbcSStefano Zampini       } else { /* update values only */
4632cbc15d9SMark         if (!loTriFactor->AA_h) {
4642cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4652cbc15d9SMark         }
466da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4672cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
468da79fbbcSStefano Zampini         v        = aa;
469da79fbbcSStefano Zampini         vi       = aj;
470da79fbbcSStefano Zampini         offset   = 1;
471da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
472da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4732cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
474da79fbbcSStefano Zampini           offset      += nz;
4752cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
476da79fbbcSStefano Zampini           offset      += 1;
477da79fbbcSStefano Zampini           v  += nz;
478da79fbbcSStefano Zampini         }
4792cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
480da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
481da79fbbcSStefano Zampini       }
4829ae82921SPaul Mullowney     } catch(char *ex) {
4839ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4849ae82921SPaul Mullowney     }
4859ae82921SPaul Mullowney   }
4869ae82921SPaul Mullowney   PetscFunctionReturn(0);
4879ae82921SPaul Mullowney }
4889ae82921SPaul Mullowney 
489087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4909ae82921SPaul Mullowney {
4919ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4929ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4939ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
494aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
4959ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4969ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
4979ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
4989ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4999ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
5009ae82921SPaul Mullowney   PetscErrorCode                    ierr;
50157d48284SJunchao Zhang   cudaError_t                       cerr;
5029ae82921SPaul Mullowney 
5039ae82921SPaul Mullowney   PetscFunctionBegin;
504cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
505c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5069ae82921SPaul Mullowney     try {
5079ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5089ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
509da79fbbcSStefano Zampini       if (!upTriFactor) {
5102cbc15d9SMark         PetscScalar *AAUp;
5112cbc15d9SMark 
5122cbc15d9SMark         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
5132cbc15d9SMark 
5149ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
51557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
51657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
5179ae82921SPaul Mullowney 
5189ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5199ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5209ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5219ae82921SPaul Mullowney         offset = nzUpper;
5229ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5239ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5249ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5259ae82921SPaul Mullowney 
526e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5279ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5289ae82921SPaul Mullowney 
529e057df02SPaul Mullowney           /* decrement the offset */
5309ae82921SPaul Mullowney           offset -= (nz+1);
5319ae82921SPaul Mullowney 
532e057df02SPaul Mullowney           /* first, set the diagonal elements */
5339ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
53409f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5359ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5369ae82921SPaul Mullowney 
537580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
538580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5399ae82921SPaul Mullowney         }
5402205254eSKarl Rupp 
541aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
542da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
543da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5442205254eSKarl Rupp 
545aa372e3fSPaul Mullowney         /* Create the matrix description */
54657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
54757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
5481b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
549afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
550afb2bd1cSJunchao Zhang        #else
55157d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
552afb2bd1cSJunchao Zhang        #endif
55357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
55457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
555aa372e3fSPaul Mullowney 
556aa372e3fSPaul Mullowney         /* set the operation */
557aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
558aa372e3fSPaul Mullowney 
559aa372e3fSPaul Mullowney         /* set the matrix */
560aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
561aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
562aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
563aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
564aa372e3fSPaul Mullowney 
565aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
566aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
567aa372e3fSPaul Mullowney 
568aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
569aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
570aa372e3fSPaul Mullowney 
571aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
572aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
573aa372e3fSPaul Mullowney 
574afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
575da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
576afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
5771b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
578afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
579afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
580afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
581afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
582afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
583afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
584afb2bd1cSJunchao Zhang       #endif
585afb2bd1cSJunchao Zhang 
586aa372e3fSPaul Mullowney         /* perform the solve analysis */
587aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
588aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
589aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
590afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
5911b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
592afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
593afb2bd1cSJunchao Zhang                                #endif
594afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
595da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
596da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
597aa372e3fSPaul Mullowney 
598da79fbbcSStefano Zampini         /* assign the pointer */
599aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
6002cbc15d9SMark         upTriFactor->AA_h = AAUp;
60157d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
60257d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
6034863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
604da79fbbcSStefano Zampini       } else {
6052cbc15d9SMark         if (!upTriFactor->AA_h) {
6062cbc15d9SMark           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
6072cbc15d9SMark         }
608da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
609da79fbbcSStefano Zampini         offset = nzUpper;
610da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
611da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
612da79fbbcSStefano Zampini 
613da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
614da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
615da79fbbcSStefano Zampini 
616da79fbbcSStefano Zampini           /* decrement the offset */
617da79fbbcSStefano Zampini           offset -= (nz+1);
618da79fbbcSStefano Zampini 
619da79fbbcSStefano Zampini           /* first, set the diagonal elements */
6202cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
6212cbc15d9SMark           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
622da79fbbcSStefano Zampini         }
6232cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
624da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
625da79fbbcSStefano Zampini       }
6269ae82921SPaul Mullowney     } catch(char *ex) {
6279ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6289ae82921SPaul Mullowney     }
6299ae82921SPaul Mullowney   }
6309ae82921SPaul Mullowney   PetscFunctionReturn(0);
6319ae82921SPaul Mullowney }
6329ae82921SPaul Mullowney 
633087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6349ae82921SPaul Mullowney {
6359ae82921SPaul Mullowney   PetscErrorCode               ierr;
6369ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6379ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6389ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6399ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6409ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6419ae82921SPaul Mullowney 
6429ae82921SPaul Mullowney   PetscFunctionBegin;
643da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
644087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
645087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
6462205254eSKarl Rupp 
647da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
648aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6499ae82921SPaul Mullowney 
650c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
651e057df02SPaul Mullowney   /* lower triangular indices */
6529ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
653da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
654da79fbbcSStefano Zampini     const PetscInt *r;
655da79fbbcSStefano Zampini 
656da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
657aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
658aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6599ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
660da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
661da79fbbcSStefano Zampini   }
6629ae82921SPaul Mullowney 
663e057df02SPaul Mullowney   /* upper triangular indices */
6649ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
665da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
666da79fbbcSStefano Zampini     const PetscInt *c;
667da79fbbcSStefano Zampini 
668da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
669aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
670aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6719ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
672da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
673da79fbbcSStefano Zampini   }
6749ae82921SPaul Mullowney   PetscFunctionReturn(0);
6759ae82921SPaul Mullowney }
6769ae82921SPaul Mullowney 
677087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
678087f3262SPaul Mullowney {
679087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
680087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
681aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
682aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
683087f3262SPaul Mullowney   cusparseStatus_t                  stat;
684087f3262SPaul Mullowney   PetscErrorCode                    ierr;
68557d48284SJunchao Zhang   cudaError_t                       cerr;
686087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
687087f3262SPaul Mullowney   PetscScalar                       *AAUp;
688087f3262SPaul Mullowney   PetscScalar                       *AALo;
689087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
690087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
691087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
692087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
693087f3262SPaul Mullowney 
694087f3262SPaul Mullowney   PetscFunctionBegin;
695cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
696c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
697087f3262SPaul Mullowney     try {
698da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
699da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
700da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
701087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
70257d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
70357d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
704087f3262SPaul Mullowney 
705087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
706087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
707087f3262SPaul Mullowney         AiUp[n]=nzUpper;
708087f3262SPaul Mullowney         offset = 0;
709087f3262SPaul Mullowney         for (i=0; i<n; i++) {
710087f3262SPaul Mullowney           /* set the pointers */
711087f3262SPaul Mullowney           v  = aa + ai[i];
712087f3262SPaul Mullowney           vj = aj + ai[i];
713087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
714087f3262SPaul Mullowney 
715087f3262SPaul Mullowney           /* first, set the diagonal elements */
716087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
71709f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
718087f3262SPaul Mullowney           AiUp[i]      = offset;
71909f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
720087f3262SPaul Mullowney 
721087f3262SPaul Mullowney           offset+=1;
722087f3262SPaul Mullowney           if (nz>0) {
723f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
724580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
725087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
726087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
727087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
728087f3262SPaul Mullowney             }
729087f3262SPaul Mullowney             offset+=nz;
730087f3262SPaul Mullowney           }
731087f3262SPaul Mullowney         }
732087f3262SPaul Mullowney 
733aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
734da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
735da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
736087f3262SPaul Mullowney 
737aa372e3fSPaul Mullowney         /* Create the matrix description */
73857d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
73957d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7401b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
741afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
742afb2bd1cSJunchao Zhang        #else
74357d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
744afb2bd1cSJunchao Zhang        #endif
74557d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
74657d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
747087f3262SPaul Mullowney 
748aa372e3fSPaul Mullowney         /* set the matrix */
749aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
750aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
751aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
752aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
753aa372e3fSPaul Mullowney 
754aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
755aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
756aa372e3fSPaul Mullowney 
757aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
758aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
759aa372e3fSPaul Mullowney 
760aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
761aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
762aa372e3fSPaul Mullowney 
763afb2bd1cSJunchao Zhang         /* set the operation */
764afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
765afb2bd1cSJunchao Zhang 
766afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
767da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
768afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7691b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
770afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
771afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
772afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
773afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
774afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
775afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
776afb2bd1cSJunchao Zhang       #endif
777afb2bd1cSJunchao Zhang 
778aa372e3fSPaul Mullowney         /* perform the solve analysis */
779aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
780aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
781aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
782afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7831b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
784afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
785afb2bd1cSJunchao Zhang                                 #endif
786afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
787da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
788da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
789aa372e3fSPaul Mullowney 
790da79fbbcSStefano Zampini         /* assign the pointer */
791aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
792aa372e3fSPaul Mullowney 
793aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
794da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
795da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
796aa372e3fSPaul Mullowney 
797aa372e3fSPaul Mullowney         /* Create the matrix description */
79857d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
79957d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8001b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
801afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
802afb2bd1cSJunchao Zhang        #else
80357d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
804afb2bd1cSJunchao Zhang        #endif
80557d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
80657d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
807aa372e3fSPaul Mullowney 
808aa372e3fSPaul Mullowney         /* set the operation */
809aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
810aa372e3fSPaul Mullowney 
811aa372e3fSPaul Mullowney         /* set the matrix */
812aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
813aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
814aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
815aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
816aa372e3fSPaul Mullowney 
817aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
818aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
819aa372e3fSPaul Mullowney 
820aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
821aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
824aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
825aa372e3fSPaul Mullowney 
826afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
827da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
828afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8291b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
830afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
831afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
832afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
833afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
834afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
835afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
836afb2bd1cSJunchao Zhang       #endif
837afb2bd1cSJunchao Zhang 
838aa372e3fSPaul Mullowney         /* perform the solve analysis */
839aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
840aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
841aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
842afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8431b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
844afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
845afb2bd1cSJunchao Zhang                                 #endif
846afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
847da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
848da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
849aa372e3fSPaul Mullowney 
850da79fbbcSStefano Zampini         /* assign the pointer */
851aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
852087f3262SPaul Mullowney 
853da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
85457d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
85557d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
856da79fbbcSStefano Zampini       } else {
857da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
858da79fbbcSStefano Zampini         offset = 0;
859da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
860da79fbbcSStefano Zampini           /* set the pointers */
861da79fbbcSStefano Zampini           v  = aa + ai[i];
862da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
863da79fbbcSStefano Zampini 
864da79fbbcSStefano Zampini           /* first, set the diagonal elements */
865da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
866da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
867da79fbbcSStefano Zampini 
868da79fbbcSStefano Zampini           offset+=1;
869da79fbbcSStefano Zampini           if (nz>0) {
870da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
871da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
872da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
873da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
874da79fbbcSStefano Zampini             }
875da79fbbcSStefano Zampini             offset+=nz;
876da79fbbcSStefano Zampini           }
877da79fbbcSStefano Zampini         }
878da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
879da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
880da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
881da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
882da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
883da79fbbcSStefano Zampini       }
88457d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
88557d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
886087f3262SPaul Mullowney     } catch(char *ex) {
887087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
888087f3262SPaul Mullowney     }
889087f3262SPaul Mullowney   }
890087f3262SPaul Mullowney   PetscFunctionReturn(0);
891087f3262SPaul Mullowney }
892087f3262SPaul Mullowney 
893087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
8949ae82921SPaul Mullowney {
8959ae82921SPaul Mullowney   PetscErrorCode               ierr;
896087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
897087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
898087f3262SPaul Mullowney   IS                           ip = a->row;
899087f3262SPaul Mullowney   PetscBool                    perm_identity;
900087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
901087f3262SPaul Mullowney 
902087f3262SPaul Mullowney   PetscFunctionBegin;
903da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
904087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
905da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
906aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
907aa372e3fSPaul Mullowney 
908da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
909da79fbbcSStefano Zampini 
910087f3262SPaul Mullowney   /* lower triangular indices */
911087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
912087f3262SPaul Mullowney   if (!perm_identity) {
9134e4bbfaaSStefano Zampini     IS             iip;
914da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9154e4bbfaaSStefano Zampini 
9164e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
9174e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
918da79fbbcSStefano Zampini     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
919aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
920aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
921aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9224e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9234e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
9244e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
925087f3262SPaul Mullowney     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
926da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
927da79fbbcSStefano Zampini   }
928087f3262SPaul Mullowney   PetscFunctionReturn(0);
929087f3262SPaul Mullowney }
930087f3262SPaul Mullowney 
9316fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
9329ae82921SPaul Mullowney {
9339ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
9349ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
9359ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
936b175d8bbSPaul Mullowney   PetscErrorCode ierr;
9379ae82921SPaul Mullowney 
9389ae82921SPaul Mullowney   PetscFunctionBegin;
9399ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
940ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
941e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9429ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
9439ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
944bda325fcSPaul Mullowney   if (row_identity && col_identity) {
945bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
946bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9474e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9484e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
949bda325fcSPaul Mullowney   } else {
950bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
951bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9524e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9534e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
954bda325fcSPaul Mullowney   }
9558dc1d2a3SPaul Mullowney 
956e057df02SPaul Mullowney   /* get the triangular factors */
957087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
9589ae82921SPaul Mullowney   PetscFunctionReturn(0);
9599ae82921SPaul Mullowney }
9609ae82921SPaul Mullowney 
961087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
962087f3262SPaul Mullowney {
963087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
964087f3262SPaul Mullowney   IS             ip = b->row;
965087f3262SPaul Mullowney   PetscBool      perm_identity;
966b175d8bbSPaul Mullowney   PetscErrorCode ierr;
967087f3262SPaul Mullowney 
968087f3262SPaul Mullowney   PetscFunctionBegin;
969087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
970ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
971087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
972087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
973087f3262SPaul Mullowney   if (perm_identity) {
974087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
975087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9764e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9774e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
978087f3262SPaul Mullowney   } else {
979087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
980087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9814e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9824e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
983087f3262SPaul Mullowney   }
984087f3262SPaul Mullowney 
985087f3262SPaul Mullowney   /* get the triangular factors */
986087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
987087f3262SPaul Mullowney   PetscFunctionReturn(0);
988087f3262SPaul Mullowney }
9899ae82921SPaul Mullowney 
990b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
991bda325fcSPaul Mullowney {
992bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
993aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
994aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
995da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
996da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
997bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
998aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
999aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
1000aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
1001aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
10021b0a6780SStefano Zampini   cudaError_t                       cerr;
1003da79fbbcSStefano Zampini   PetscErrorCode                    ierr;
1004b175d8bbSPaul Mullowney 
1005bda325fcSPaul Mullowney   PetscFunctionBegin;
1006aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
1007da79fbbcSStefano Zampini   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1008da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1009aa372e3fSPaul Mullowney 
1010aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1011aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1012aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1013aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1014aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1015aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1016aa372e3fSPaul Mullowney 
1017aa372e3fSPaul Mullowney   /* Create the matrix description */
101857d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
101957d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
102057d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
102157d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
102257d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1023aa372e3fSPaul Mullowney 
1024aa372e3fSPaul Mullowney   /* set the operation */
1025aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1026aa372e3fSPaul Mullowney 
1027aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1028aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1029afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1030afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1031aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1032afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1033afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1034afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1035aa372e3fSPaul Mullowney 
1036aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1037afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1038afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1039afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1040afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
1041afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
1042afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
1043afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
1044afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1045afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1046afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
10471b0a6780SStefano Zampini   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1048afb2bd1cSJunchao Zhang #endif
1049afb2bd1cSJunchao Zhang 
1050da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1051aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1052aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1053aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
1054aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
1055aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
1056aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
1057afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1058afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1059afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1060afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1061afb2bd1cSJunchao Zhang                         #else
1062afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1063afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1064afb2bd1cSJunchao Zhang                         #endif
1065afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1066da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1067da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1068aa372e3fSPaul Mullowney 
1069afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1070da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1071afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
10721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1073afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1074afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1075afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1076afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1077afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1078afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1079afb2bd1cSJunchao Zhang #endif
1080afb2bd1cSJunchao Zhang 
1081afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1082aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1083afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1084afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1085afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
10861b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1087afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1088afb2bd1cSJunchao Zhang                           #endif
1089afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1090da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1091da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1092aa372e3fSPaul Mullowney 
1093da79fbbcSStefano Zampini   /* assign the pointer */
1094aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1095aa372e3fSPaul Mullowney 
1096aa372e3fSPaul Mullowney   /*********************************************/
1097aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1098aa372e3fSPaul Mullowney   /*********************************************/
1099aa372e3fSPaul Mullowney 
1100aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1101da79fbbcSStefano Zampini   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1102da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1103aa372e3fSPaul Mullowney 
1104aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1105aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1106aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1107aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1108aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1109aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1110aa372e3fSPaul Mullowney 
1111aa372e3fSPaul Mullowney   /* Create the matrix description */
111257d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
111357d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
111457d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
111557d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
111657d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1117aa372e3fSPaul Mullowney 
1118aa372e3fSPaul Mullowney   /* set the operation */
1119aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1120aa372e3fSPaul Mullowney 
1121aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1122aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1123afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1124afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1125aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1126afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1127afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1128afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1129aa372e3fSPaul Mullowney 
1130aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1131afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1132afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1133afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1134afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1135afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1136afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1137afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1138afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1139afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1140afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1141afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1142afb2bd1cSJunchao Zhang #endif
1143afb2bd1cSJunchao Zhang 
1144da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1145aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1146aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1147aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1148aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1149aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1150aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1151afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1152afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1153afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1154afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1155afb2bd1cSJunchao Zhang                         #else
1156afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1157afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1158afb2bd1cSJunchao Zhang                         #endif
1159afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1160da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1161da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1162aa372e3fSPaul Mullowney 
1163afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1164da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1165afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
11661b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1167afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1168afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1169afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1170afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1171afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1172afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1173afb2bd1cSJunchao Zhang   #endif
1174afb2bd1cSJunchao Zhang 
1175afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1176aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1177afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1178afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1179afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
11801b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1181afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1182afb2bd1cSJunchao Zhang                           #endif
1183afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1184da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1185da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1186aa372e3fSPaul Mullowney 
1187da79fbbcSStefano Zampini   /* assign the pointer */
1188aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1189bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1190bda325fcSPaul Mullowney }
1191bda325fcSPaul Mullowney 
1192b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1193bda325fcSPaul Mullowney {
1194aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1195aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1196aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1197bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1198bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1199aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1200b06137fdSPaul Mullowney   cudaError_t                  err;
120185ba7357SStefano Zampini   PetscErrorCode               ierr;
1202b175d8bbSPaul Mullowney 
1203bda325fcSPaul Mullowney   PetscFunctionBegin;
1204*fcdce8c4SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
120585ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
120685ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
120785ba7357SStefano Zampini   /* create cusparse matrix */
1208aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
120957d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1210aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
121157d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
121257d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1213aa372e3fSPaul Mullowney 
1214b06137fdSPaul Mullowney   /* set alpha and beta */
1215afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12167656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12177656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1218afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12197656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12207656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
122157d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1222b06137fdSPaul Mullowney 
1223aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1224aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1225aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1226554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1227554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1228aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1229a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1230aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1231aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1232a3fdcf43SKarl Rupp 
123381902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
123481902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1235afb2bd1cSJunchao Zhang 
123681902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1237afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1238afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1239afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1240afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1241afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1242afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1243afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1244afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1245afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1246afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1247afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1248afb2bd1cSJunchao Zhang    #endif
1249afb2bd1cSJunchao Zhang 
1250a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1251a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1252aa372e3fSPaul Mullowney                             matrix->values->data().get(),
125381902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1254aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1255aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1256afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1257afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1258afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1259afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1260afb2bd1cSJunchao Zhang                           #else
1261afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1262afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1263afb2bd1cSJunchao Zhang                           #endif
1264afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1265aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1266afb2bd1cSJunchao Zhang 
1267afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1268afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1269afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1270afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1271afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1272afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1273afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1274afb2bd1cSJunchao Zhang    #endif
1275aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1276afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1277afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1278afb2bd1cSJunchao Zhang    #else
1279aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
128051c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
128151c6d536SStefano Zampini     /* First convert HYB to CSR */
1282aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1283aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1284aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1285aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1286aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1287aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1288aa372e3fSPaul Mullowney 
1289aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1290aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1291aa372e3fSPaul Mullowney                             temp->values->data().get(),
1292aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
129357d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1294aa372e3fSPaul Mullowney 
1295aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1296aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1297aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1298aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1299aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1300aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1301aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1302aa372e3fSPaul Mullowney 
1303aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1304aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1305aa372e3fSPaul Mullowney                             temp->values->data().get(),
1306aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1307aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1308aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1309aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1310aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
131157d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1312aa372e3fSPaul Mullowney 
1313aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1314aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
131557d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1316aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1317aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1318aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1319aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1320aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1321aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
132257d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1323aa372e3fSPaul Mullowney 
1324aa372e3fSPaul Mullowney     /* assign the pointer */
1325aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1326aa372e3fSPaul Mullowney     /* delete temporaries */
1327aa372e3fSPaul Mullowney     if (tempT) {
1328aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1329aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1330aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1331aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1332087f3262SPaul Mullowney     }
1333aa372e3fSPaul Mullowney     if (temp) {
1334aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1335aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1336aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1337aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1338aa372e3fSPaul Mullowney     }
1339afb2bd1cSJunchao Zhang    #endif
1340aa372e3fSPaul Mullowney   }
134105035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
134285ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
134385ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1344213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1345213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1346aa372e3fSPaul Mullowney   /* assign the pointer */
1347aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1348bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1349bda325fcSPaul Mullowney }
1350bda325fcSPaul Mullowney 
13514e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
13526fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1353bda325fcSPaul Mullowney {
1354c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1355465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1356465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1357465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1358465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1359bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1360bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1361aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1362aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1363aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1364b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
136557d48284SJunchao Zhang   cudaError_t                           cerr;
1366bda325fcSPaul Mullowney 
1367bda325fcSPaul Mullowney   PetscFunctionBegin;
1368aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1369aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1370bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1371aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1372aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1373bda325fcSPaul Mullowney   }
1374bda325fcSPaul Mullowney 
1375bda325fcSPaul Mullowney   /* Get the GPU pointers */
1376c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1377c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1378c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1379c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1380bda325fcSPaul Mullowney 
13817a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1382aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1383c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1384c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1385c41cb2e2SAlejandro Lamas Daviña                xGPU);
1386aa372e3fSPaul Mullowney 
1387aa372e3fSPaul Mullowney   /* First, solve U */
1388aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1389afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
13901b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1391afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1392afb2bd1cSJunchao Zhang                       #endif
1393afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1394aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1395aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1396aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1397aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1398afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
13991b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1400afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1401afb2bd1cSJunchao Zhang                       #endif
1402afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1403aa372e3fSPaul Mullowney 
1404aa372e3fSPaul Mullowney   /* Then, solve L */
1405aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1406afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14071b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1408afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1409afb2bd1cSJunchao Zhang                       #endif
1410afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1411aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1412aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1413aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1414aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1415afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
14161b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1417afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1418afb2bd1cSJunchao Zhang                       #endif
1419afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1420aa372e3fSPaul Mullowney 
1421aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1422c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1423c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1424aa372e3fSPaul Mullowney                tempGPU->begin());
1425aa372e3fSPaul Mullowney 
1426aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1427c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1428bda325fcSPaul Mullowney 
1429bda325fcSPaul Mullowney   /* restore */
1430c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1431c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
143205035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1433661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1434958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1435bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1436bda325fcSPaul Mullowney }
1437bda325fcSPaul Mullowney 
14386fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1439bda325fcSPaul Mullowney {
1440465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1441465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1442bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1443bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1444aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1445aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1446aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1447b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
144857d48284SJunchao Zhang   cudaError_t                       cerr;
1449bda325fcSPaul Mullowney 
1450bda325fcSPaul Mullowney   PetscFunctionBegin;
1451aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1452aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1453bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1454aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1455aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1456bda325fcSPaul Mullowney   }
1457bda325fcSPaul Mullowney 
1458bda325fcSPaul Mullowney   /* Get the GPU pointers */
1459c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1460c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1461bda325fcSPaul Mullowney 
14627a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1463aa372e3fSPaul Mullowney   /* First, solve U */
1464aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1465afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
14661b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1467afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1468afb2bd1cSJunchao Zhang                       #endif
1469afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1470aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1471aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1472aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1473aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1474afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
14751b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1476afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1477afb2bd1cSJunchao Zhang                       #endif
1478afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1479aa372e3fSPaul Mullowney 
1480aa372e3fSPaul Mullowney   /* Then, solve L */
1481aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1482afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14831b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1484afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1485afb2bd1cSJunchao Zhang                       #endif
1486afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1487aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1488aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1489aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1490aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1491afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
14921b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1493afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1494afb2bd1cSJunchao Zhang                       #endif
1495afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1496bda325fcSPaul Mullowney 
1497bda325fcSPaul Mullowney   /* restore */
1498c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1499c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
150005035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1501661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1502958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1503bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1504bda325fcSPaul Mullowney }
1505bda325fcSPaul Mullowney 
15066fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
15079ae82921SPaul Mullowney {
1508465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1509465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1510465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1511465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15129ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15139ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1514aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1515aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1516aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1517b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
151857d48284SJunchao Zhang   cudaError_t                           cerr;
15199ae82921SPaul Mullowney 
15209ae82921SPaul Mullowney   PetscFunctionBegin;
1521ebc8f436SDominic Meiser 
1522e057df02SPaul Mullowney   /* Get the GPU pointers */
1523c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1524c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1525c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1526c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
15279ae82921SPaul Mullowney 
15287a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1529aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1530c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1531c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
15324e4bbfaaSStefano Zampini                tempGPU->begin());
1533aa372e3fSPaul Mullowney 
1534aa372e3fSPaul Mullowney   /* Next, solve L */
1535aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1536afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
15371b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1538afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1539afb2bd1cSJunchao Zhang                       #endif
1540afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1541aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1542aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1543aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1544aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1545afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
15461b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1547afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1548afb2bd1cSJunchao Zhang                       #endif
1549afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1550aa372e3fSPaul Mullowney 
1551aa372e3fSPaul Mullowney   /* Then, solve U */
1552aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1553afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
15541b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1555afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1556afb2bd1cSJunchao Zhang                       #endif
1557afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1558aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1559aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1560aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1561aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1562afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
15631b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1564afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1565afb2bd1cSJunchao Zhang                       #endif
1566afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1567aa372e3fSPaul Mullowney 
15684e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
15694e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
15704e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
15714e4bbfaaSStefano Zampini                xGPU);
15729ae82921SPaul Mullowney 
1573c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1574c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
157505035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1576661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1577958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
15789ae82921SPaul Mullowney   PetscFunctionReturn(0);
15799ae82921SPaul Mullowney }
15809ae82921SPaul Mullowney 
15816fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
15829ae82921SPaul Mullowney {
1583465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1584465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
15859ae82921SPaul Mullowney   cusparseStatus_t                  stat;
15869ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1587aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1588aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1589aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1590b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
159157d48284SJunchao Zhang   cudaError_t                       cerr;
15929ae82921SPaul Mullowney 
15939ae82921SPaul Mullowney   PetscFunctionBegin;
1594e057df02SPaul Mullowney   /* Get the GPU pointers */
1595c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1596c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
15979ae82921SPaul Mullowney 
15987a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1599aa372e3fSPaul Mullowney   /* First, solve L */
1600aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1601afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16021b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1603afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1604afb2bd1cSJunchao Zhang                       #endif
1605afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1606aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1607aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1608aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1609aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1610afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
16111b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1612afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1613afb2bd1cSJunchao Zhang                       #endif
1614afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1615aa372e3fSPaul Mullowney 
1616aa372e3fSPaul Mullowney   /* Next, solve U */
1617aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1618afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16191b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1620afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1621afb2bd1cSJunchao Zhang                       #endif
1622afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1623aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1624aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1625aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1626aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1627afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
16281b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1629afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1630afb2bd1cSJunchao Zhang                       #endif
1631afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
16329ae82921SPaul Mullowney 
1633c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1634c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
163505035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1636661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1637958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
16389ae82921SPaul Mullowney   PetscFunctionReturn(0);
16399ae82921SPaul Mullowney }
16409ae82921SPaul Mullowney 
16417e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
16427e8381f9SStefano Zampini {
16437e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
16447e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
16457e8381f9SStefano Zampini   cudaError_t        cerr;
16467e8381f9SStefano Zampini   PetscErrorCode     ierr;
16477e8381f9SStefano Zampini 
16487e8381f9SStefano Zampini   PetscFunctionBegin;
16497e8381f9SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
16507e8381f9SStefano Zampini     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
16517e8381f9SStefano Zampini 
16527e8381f9SStefano Zampini     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
16537e8381f9SStefano Zampini     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
16547e8381f9SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
16557e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
16567e8381f9SStefano Zampini     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
16577e8381f9SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
16587e8381f9SStefano Zampini   }
16597e8381f9SStefano Zampini   PetscFunctionReturn(0);
16607e8381f9SStefano Zampini }
16617e8381f9SStefano Zampini 
16627e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
16637e8381f9SStefano Zampini {
16647e8381f9SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
16657e8381f9SStefano Zampini   PetscErrorCode ierr;
16667e8381f9SStefano Zampini 
16677e8381f9SStefano Zampini   PetscFunctionBegin;
16687e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
16697e8381f9SStefano Zampini   *array = a->a;
16707e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
16717e8381f9SStefano Zampini   PetscFunctionReturn(0);
16727e8381f9SStefano Zampini }
16737e8381f9SStefano Zampini 
16746fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
16759ae82921SPaul Mullowney {
1676aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
16777c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
16789ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1679213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
16809ae82921SPaul Mullowney   PetscErrorCode               ierr;
1681aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1682b06137fdSPaul Mullowney   cudaError_t                  err;
16839ae82921SPaul Mullowney 
16849ae82921SPaul Mullowney   PetscFunctionBegin;
1685*fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
1686c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1687*fcdce8c4SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
168881902715SJunchao Zhang       /* Copy values only */
1689afb2bd1cSJunchao Zhang       CsrMatrix *matrix,*matrixT;
1690afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
169185ba7357SStefano Zampini 
169285ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1693afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
169405035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
16954863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
169685ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
169781902715SJunchao Zhang 
169881902715SJunchao Zhang       /* Update matT when it was built before */
169981902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
170081902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1701afb2bd1cSJunchao Zhang         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
170285ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
170381902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1704afb2bd1cSJunchao Zhang                             A->cmap->n, matrix->num_entries,
1705afb2bd1cSJunchao Zhang                             matrix->values->data().get(),
170681902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1707afb2bd1cSJunchao Zhang                             matrix->column_indices->data().get(),
1708afb2bd1cSJunchao Zhang                             matrixT->values->data().get(),
1709afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1710afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1711afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1712afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1713afb2bd1cSJunchao Zhang                           #else
1714afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1715afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1716afb2bd1cSJunchao Zhang                           #endif
1717afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
171805035670SJunchao Zhang         err  = WaitForCUDA();CHKERRCUDA(err);
171985ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
172081902715SJunchao Zhang       }
172134d6c7a5SJose E. Roman     } else {
172285ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
17237c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
17247c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
17257c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
172681902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
17279ae82921SPaul Mullowney       try {
17289ae82921SPaul Mullowney         if (a->compressedrow.use) {
17299ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
17309ae82921SPaul Mullowney           ii   = a->compressedrow.i;
17319ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
17329ae82921SPaul Mullowney         } else {
1733213423ffSJunchao Zhang           m    = A->rmap->n;
1734213423ffSJunchao Zhang           ii   = a->i;
1735e6e9a74fSStefano Zampini           ridx = NULL;
17369ae82921SPaul Mullowney         }
1737213423ffSJunchao Zhang         cusparsestruct->nrows = m;
17389ae82921SPaul Mullowney 
173985ba7357SStefano Zampini         /* create cusparse matrix */
1740aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
174157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
174257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
174357d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
17449ae82921SPaul Mullowney 
1745afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
17467656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
17477656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1748afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17497656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17507656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
175157d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1752b06137fdSPaul Mullowney 
1753aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1754aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1755aa372e3fSPaul Mullowney           /* set the matrix */
1756afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1757afb2bd1cSJunchao Zhang           mat->num_rows = m;
1758afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1759afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1760afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1761afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
17629ae82921SPaul Mullowney 
1763afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1764afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1765aa372e3fSPaul Mullowney 
1766afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1767afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1768aa372e3fSPaul Mullowney 
1769aa372e3fSPaul Mullowney           /* assign the pointer */
1770afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1771afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1772afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1773afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1774afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1775afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1776afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1777afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1778afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1779afb2bd1cSJunchao Zhang           }
1780afb2bd1cSJunchao Zhang          #endif
1781aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1782afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1783afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1784afb2bd1cSJunchao Zhang          #else
1785afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1786afb2bd1cSJunchao Zhang           mat->num_rows = m;
1787afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1788afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1789afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1790afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1791aa372e3fSPaul Mullowney 
1792afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1793afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1794aa372e3fSPaul Mullowney 
1795afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1796afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1797aa372e3fSPaul Mullowney 
1798aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
179957d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1800aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1801aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1802afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1803afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1804afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1805afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
180657d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1807aa372e3fSPaul Mullowney           /* assign the pointer */
1808aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1809aa372e3fSPaul Mullowney 
1810afb2bd1cSJunchao Zhang           if (mat) {
1811afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1812afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1813afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1814afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1815087f3262SPaul Mullowney           }
1816afb2bd1cSJunchao Zhang          #endif
1817087f3262SPaul Mullowney         }
1818ca45077fSPaul Mullowney 
1819aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1820213423ffSJunchao Zhang         if (a->compressedrow.use) {
1821213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1822aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1823aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1824213423ffSJunchao Zhang           tmp = m;
1825213423ffSJunchao Zhang         } else {
1826213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1827213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1828213423ffSJunchao Zhang           tmp = 0;
1829213423ffSJunchao Zhang         }
1830213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1831aa372e3fSPaul Mullowney 
1832aa372e3fSPaul Mullowney         /* assign the pointer */
1833aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
18349ae82921SPaul Mullowney       } catch(char *ex) {
18359ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18369ae82921SPaul Mullowney       }
183705035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
183885ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
183934d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
184034d6c7a5SJose E. Roman     }
1841c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
18429ae82921SPaul Mullowney   }
18439ae82921SPaul Mullowney   PetscFunctionReturn(0);
18449ae82921SPaul Mullowney }
18459ae82921SPaul Mullowney 
1846c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1847aa372e3fSPaul Mullowney {
1848aa372e3fSPaul Mullowney   template <typename Tuple>
1849aa372e3fSPaul Mullowney   __host__ __device__
1850aa372e3fSPaul Mullowney   void operator()(Tuple t)
1851aa372e3fSPaul Mullowney   {
1852aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1853aa372e3fSPaul Mullowney   }
1854aa372e3fSPaul Mullowney };
1855aa372e3fSPaul Mullowney 
18567e8381f9SStefano Zampini struct VecCUDAEquals
18577e8381f9SStefano Zampini {
18587e8381f9SStefano Zampini   template <typename Tuple>
18597e8381f9SStefano Zampini   __host__ __device__
18607e8381f9SStefano Zampini   void operator()(Tuple t)
18617e8381f9SStefano Zampini   {
18627e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
18637e8381f9SStefano Zampini   }
18647e8381f9SStefano Zampini };
18657e8381f9SStefano Zampini 
1866e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1867e6e9a74fSStefano Zampini {
1868e6e9a74fSStefano Zampini   template <typename Tuple>
1869e6e9a74fSStefano Zampini   __host__ __device__
1870e6e9a74fSStefano Zampini   void operator()(Tuple t)
1871e6e9a74fSStefano Zampini   {
1872e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1873e6e9a74fSStefano Zampini   }
1874e6e9a74fSStefano Zampini };
1875e6e9a74fSStefano Zampini 
1876afb2bd1cSJunchao Zhang struct MatMatCusparse {
1877ccdfe979SStefano Zampini   PetscBool             cisdense;
1878ccdfe979SStefano Zampini   PetscScalar           *Bt;
1879ccdfe979SStefano Zampini   Mat                   X;
1880*fcdce8c4SStefano Zampini   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
1881*fcdce8c4SStefano Zampini   PetscLogDouble        flops;
1882*fcdce8c4SStefano Zampini   CsrMatrix             *Bcsr;
1883afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1884*fcdce8c4SStefano Zampini   cusparseSpMatDescr_t  matSpBDescr;
1885afb2bd1cSJunchao Zhang   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
1886afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matBDescr;
1887afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matCDescr;
1888afb2bd1cSJunchao Zhang   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1889*fcdce8c4SStefano Zampini   size_t                mmBufferSize;
1890*fcdce8c4SStefano Zampini   void                  *mmBuffer;
1891*fcdce8c4SStefano Zampini   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
1892*fcdce8c4SStefano Zampini   cusparseSpGEMMDescr_t spgemmDesc;
1893afb2bd1cSJunchao Zhang #endif
1894afb2bd1cSJunchao Zhang };
1895ccdfe979SStefano Zampini 
1896ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1897ccdfe979SStefano Zampini {
1898ccdfe979SStefano Zampini   PetscErrorCode   ierr;
1899ccdfe979SStefano Zampini   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
1900ccdfe979SStefano Zampini   cudaError_t      cerr;
1901*fcdce8c4SStefano Zampini  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1902*fcdce8c4SStefano Zampini   cusparseStatus_t stat;
1903*fcdce8c4SStefano Zampini  #endif
1904ccdfe979SStefano Zampini 
1905ccdfe979SStefano Zampini   PetscFunctionBegin;
1906ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1907*fcdce8c4SStefano Zampini   delete mmdata->Bcsr;
1908afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1909*fcdce8c4SStefano Zampini   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
1910*fcdce8c4SStefano Zampini   if (mmdata->mmBuffer)    { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
1911*fcdce8c4SStefano Zampini   if (mmdata->mmBuffer2)   { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
1912afb2bd1cSJunchao Zhang   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
1913afb2bd1cSJunchao Zhang   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
1914*fcdce8c4SStefano Zampini   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
1915afb2bd1cSJunchao Zhang  #endif
1916ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1917ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1918ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1919ccdfe979SStefano Zampini }
1920ccdfe979SStefano Zampini 
1921ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1922ccdfe979SStefano Zampini 
1923ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1924ccdfe979SStefano Zampini {
1925ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1926ccdfe979SStefano Zampini   Mat                          A,B;
1927afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1928ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1929ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1930ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1931ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1932ccdfe979SStefano Zampini   const PetscScalar            *barray;
1933ccdfe979SStefano Zampini   PetscScalar                  *carray;
1934ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1935ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1936ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1937ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1938afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1939ccdfe979SStefano Zampini 
1940ccdfe979SStefano Zampini   PetscFunctionBegin;
1941ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1942ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1943ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1944ccdfe979SStefano Zampini   A    = product->A;
1945ccdfe979SStefano Zampini   B    = product->B;
1946ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1947ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1948ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1949ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1950ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1951ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1952ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1953ccdfe979SStefano Zampini   switch (product->type) {
1954ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1955ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1956ccdfe979SStefano Zampini     mat = cusp->mat;
1957ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1958ccdfe979SStefano Zampini     m   = A->rmap->n;
1959ccdfe979SStefano Zampini     n   = B->cmap->n;
1960ccdfe979SStefano Zampini     break;
1961ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1962e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1963e6e9a74fSStefano Zampini       mat = cusp->mat;
1964e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1965e6e9a74fSStefano Zampini     } else {
1966ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1967ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1968ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1969e6e9a74fSStefano Zampini     }
1970ccdfe979SStefano Zampini     m = A->cmap->n;
1971ccdfe979SStefano Zampini     n = B->cmap->n;
1972ccdfe979SStefano Zampini     break;
1973ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1974ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1975ccdfe979SStefano Zampini     mat = cusp->mat;
1976ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1977ccdfe979SStefano Zampini     m   = A->rmap->n;
1978ccdfe979SStefano Zampini     n   = B->rmap->n;
1979ccdfe979SStefano Zampini     break;
1980ccdfe979SStefano Zampini   default:
1981ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1982ccdfe979SStefano Zampini   }
1983ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1984ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1985ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1986ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1987afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
1988ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1989afb2bd1cSJunchao Zhang 
1990ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1991c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1992c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1993c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1994c8378d12SStefano Zampini   } else {
1995c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1996c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1997c8378d12SStefano Zampini   }
1998c8378d12SStefano Zampini 
1999c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2000afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2001afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2002*fcdce8c4SStefano Zampini   /* (re)allcoate mmBuffer if not initialized or LDAs are different */
2003afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2004*fcdce8c4SStefano Zampini     size_t mmBufferSize;
2005afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2006afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2007afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2008afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2009afb2bd1cSJunchao Zhang     }
2010c8378d12SStefano Zampini 
2011afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2012afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2013afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2014afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2015afb2bd1cSJunchao Zhang     }
2016afb2bd1cSJunchao Zhang 
2017afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2018afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2019afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2020afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2021afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2022afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2023afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2024afb2bd1cSJunchao Zhang     }
2025afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2026afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2027afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2028*fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2029*fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2030*fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2031*fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2032*fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2033*fcdce8c4SStefano Zampini     }
2034afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2035afb2bd1cSJunchao Zhang   } else {
2036afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2037afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2038afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2039afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2040afb2bd1cSJunchao Zhang   }
2041afb2bd1cSJunchao Zhang 
2042afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2043afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2044afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2045afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2046*fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2047afb2bd1cSJunchao Zhang  #else
2048afb2bd1cSJunchao Zhang   PetscInt k;
2049afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2050ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2051ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2052ccdfe979SStefano Zampini     cublasStatus_t cerr;
2053ccdfe979SStefano Zampini 
2054ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2055ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2056ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2057ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2058ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2059ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2060ccdfe979SStefano Zampini     blda = B->cmap->n;
2061afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2062afb2bd1cSJunchao Zhang   } else {
2063afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2064ccdfe979SStefano Zampini   }
2065ccdfe979SStefano Zampini 
2066afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2067ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2068afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2069ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2070ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2071ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2072ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2073ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2074afb2bd1cSJunchao Zhang  #endif
2075afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2076c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2077c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2078ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2079ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2080ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2081ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2082ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2083ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2084ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2085ccdfe979SStefano Zampini   } else {
2086ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2087ccdfe979SStefano Zampini   }
2088ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2089ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2090ccdfe979SStefano Zampini   }
2091ccdfe979SStefano Zampini   if (!biscuda) {
2092ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2093ccdfe979SStefano Zampini   }
2094ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2095ccdfe979SStefano Zampini }
2096ccdfe979SStefano Zampini 
2097ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2098ccdfe979SStefano Zampini {
2099ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2100ccdfe979SStefano Zampini   Mat                A,B;
2101ccdfe979SStefano Zampini   PetscInt           m,n;
2102ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2103ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2104ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2105ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2106ccdfe979SStefano Zampini 
2107ccdfe979SStefano Zampini   PetscFunctionBegin;
2108ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2109ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2110ccdfe979SStefano Zampini   A    = product->A;
2111ccdfe979SStefano Zampini   B    = product->B;
2112ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2113ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2114ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2115ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2116ccdfe979SStefano Zampini   switch (product->type) {
2117ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2118ccdfe979SStefano Zampini     m = A->rmap->n;
2119ccdfe979SStefano Zampini     n = B->cmap->n;
2120ccdfe979SStefano Zampini     break;
2121ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2122ccdfe979SStefano Zampini     m = A->cmap->n;
2123ccdfe979SStefano Zampini     n = B->cmap->n;
2124ccdfe979SStefano Zampini     break;
2125ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2126ccdfe979SStefano Zampini     m = A->rmap->n;
2127ccdfe979SStefano Zampini     n = B->rmap->n;
2128ccdfe979SStefano Zampini     break;
2129ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2130ccdfe979SStefano Zampini     m = B->cmap->n;
2131ccdfe979SStefano Zampini     n = B->cmap->n;
2132ccdfe979SStefano Zampini     break;
2133ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2134ccdfe979SStefano Zampini     m = B->rmap->n;
2135ccdfe979SStefano Zampini     n = B->rmap->n;
2136ccdfe979SStefano Zampini     break;
2137ccdfe979SStefano Zampini   default:
2138ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2139ccdfe979SStefano Zampini   }
2140ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2141ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2142ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2143ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2144ccdfe979SStefano Zampini 
2145ccdfe979SStefano Zampini   /* product data */
2146ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2147ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2148afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2149afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2150ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2151afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2152ccdfe979SStefano Zampini   }
2153afb2bd1cSJunchao Zhang  #endif
2154ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2155ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2156ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2157ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2158ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2159ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2160ccdfe979SStefano Zampini     } else {
2161ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2162ccdfe979SStefano Zampini     }
2163ccdfe979SStefano Zampini   }
2164ccdfe979SStefano Zampini   C->product->data    = mmdata;
2165ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2166ccdfe979SStefano Zampini 
2167ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2168ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2169ccdfe979SStefano Zampini }
2170ccdfe979SStefano Zampini 
2171*fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2172ccdfe979SStefano Zampini {
2173ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2174*fcdce8c4SStefano Zampini   Mat                          A,B;
2175*fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2176*fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2177*fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2178*fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2179*fcdce8c4SStefano Zampini   PetscBool                    flg;
2180ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2181*fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2182*fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2183*fcdce8c4SStefano Zampini   MatProductType               ptype;
2184*fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2185*fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2186*fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2187*fcdce8c4SStefano Zampini #endif
2188ccdfe979SStefano Zampini 
2189ccdfe979SStefano Zampini   PetscFunctionBegin;
2190ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2191*fcdce8c4SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
2192*fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2193*fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
2194*fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2195*fcdce8c4SStefano Zampini   A = product->A;
2196*fcdce8c4SStefano Zampini   B = product->B;
2197*fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2198*fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2199*fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2200*fcdce8c4SStefano Zampini     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2201*fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
2202*fcdce8c4SStefano Zampini     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2203*fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2204*fcdce8c4SStefano Zampini     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2205*fcdce8c4SStefano Zampini     goto finalize;
2206*fcdce8c4SStefano Zampini   }
2207*fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2208*fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2209*fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2210*fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2211*fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2212*fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2213*fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2214*fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2215*fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2216*fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2217*fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2218*fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2219*fcdce8c4SStefano Zampini   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2220*fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2221*fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2222*fcdce8c4SStefano Zampini 
2223*fcdce8c4SStefano Zampini   ptype = product->type;
2224*fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2225*fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2226*fcdce8c4SStefano Zampini   switch (ptype) {
2227*fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2228*fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2229*fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2230*fcdce8c4SStefano Zampini     break;
2231*fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2232*fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2233*fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2234*fcdce8c4SStefano Zampini     break;
2235*fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2236*fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2237*fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2238*fcdce8c4SStefano Zampini     break;
2239*fcdce8c4SStefano Zampini   default:
2240*fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2241*fcdce8c4SStefano Zampini   }
2242*fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
2243*fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2244*fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2245*fcdce8c4SStefano Zampini   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2246*fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2247*fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2248*fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2249*fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2250*fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2251*fcdce8c4SStefano Zampini   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
2252*fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2253*fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2254*fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2255*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2256*fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2257*fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2258*fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2259*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2260*fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2261*fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2262*fcdce8c4SStefano Zampini #else
2263*fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2264*fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2265*fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2266*fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2267*fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2268*fcdce8c4SStefano Zampini #endif
2269*fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2270*fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2271*fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2272*fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2273*fcdce8c4SStefano Zampini finalize:
2274*fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2275*fcdce8c4SStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2276*fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2277*fcdce8c4SStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2278*fcdce8c4SStefano Zampini   c->reallocs         = 0;
2279*fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2280*fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2281*fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2282*fcdce8c4SStefano Zampini   C->num_ass++;
2283*fcdce8c4SStefano Zampini   /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
2284*fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(C);CHKERRQ(ierr);
2285ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2286ccdfe979SStefano Zampini }
2287*fcdce8c4SStefano Zampini 
2288*fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2289*fcdce8c4SStefano Zampini {
2290*fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2291*fcdce8c4SStefano Zampini   Mat                          A,B;
2292*fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2293*fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2294*fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2295*fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2296*fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2297*fcdce8c4SStefano Zampini   PetscBool                    flg;
2298*fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2299*fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2300*fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2301*fcdce8c4SStefano Zampini   MatProductType               ptype;
2302*fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2303*fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2304*fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2305*fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2306*fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2307*fcdce8c4SStefano Zampini   size_t                       bufSize2;
2308*fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2309*fcdce8c4SStefano Zampini #else
2310*fcdce8c4SStefano Zampini   int                          cnz;
2311*fcdce8c4SStefano Zampini #endif
2312*fcdce8c4SStefano Zampini 
2313*fcdce8c4SStefano Zampini   PetscFunctionBegin;
2314*fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2315*fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2316*fcdce8c4SStefano Zampini   A    = product->A;
2317*fcdce8c4SStefano Zampini   B    = product->B;
2318*fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2319*fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2320*fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2321*fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2322*fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2323*fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2324*fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2325*fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2326*fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2327*fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2328*fcdce8c4SStefano Zampini 
2329*fcdce8c4SStefano Zampini   /* product data */
2330*fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2331*fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2332*fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2333*fcdce8c4SStefano Zampini 
2334*fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2335*fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2336*fcdce8c4SStefano Zampini   ptype = product->type;
2337*fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2338*fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2339*fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2340*fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2341*fcdce8c4SStefano Zampini   switch (ptype) {
2342*fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2343*fcdce8c4SStefano Zampini     m = A->rmap->n;
2344*fcdce8c4SStefano Zampini     n = B->cmap->n;
2345*fcdce8c4SStefano Zampini     k = A->cmap->n;
2346*fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2347*fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2348*fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2349*fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2350*fcdce8c4SStefano Zampini     break;
2351*fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2352*fcdce8c4SStefano Zampini     m = A->cmap->n;
2353*fcdce8c4SStefano Zampini     n = B->cmap->n;
2354*fcdce8c4SStefano Zampini     k = A->rmap->n;
2355*fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2356*fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2357*fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2358*fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2359*fcdce8c4SStefano Zampini     break;
2360*fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2361*fcdce8c4SStefano Zampini     m = A->rmap->n;
2362*fcdce8c4SStefano Zampini     n = B->rmap->n;
2363*fcdce8c4SStefano Zampini     k = A->cmap->n;
2364*fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2365*fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2366*fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2367*fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2368*fcdce8c4SStefano Zampini     break;
2369*fcdce8c4SStefano Zampini   default:
2370*fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2371*fcdce8c4SStefano Zampini   }
2372*fcdce8c4SStefano Zampini 
2373*fcdce8c4SStefano Zampini   /* create cusparse matrix */
2374*fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2375*fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2376*fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2377*fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2378*fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2379*fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2380*fcdce8c4SStefano Zampini 
2381*fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2382*fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2383*fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2384*fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2385*fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2386*fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2387*fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2388*fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2389*fcdce8c4SStefano Zampini   } else {
2390*fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2391*fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2392*fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2393*fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2394*fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2395*fcdce8c4SStefano Zampini   }
2396*fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2397*fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2398*fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2399*fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2400*fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2401*fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2402*fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2403*fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2404*fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2405*fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2406*fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2407*fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2408*fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2409*fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2410*fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2411*fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2412*fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2413*fcdce8c4SStefano Zampini     c->nz = 0;
2414*fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2415*fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2416*fcdce8c4SStefano Zampini     goto finalizesym;
2417*fcdce8c4SStefano Zampini   }
2418*fcdce8c4SStefano Zampini 
2419*fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2420*fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2421*fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2422*fcdce8c4SStefano Zampini   if (!biscompressed) {
2423*fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2424*fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2425*fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2426*fcdce8c4SStefano Zampini #endif
2427*fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2428*fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2429*fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2430*fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2431*fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2432*fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2433*fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2434*fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2435*fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2436*fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2437*fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2438*fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2439*fcdce8c4SStefano Zampini     }
2440*fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2441*fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2442*fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2443*fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2444*fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2445*fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2446*fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2447*fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2448*fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2449*fcdce8c4SStefano Zampini     }
2450*fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2451*fcdce8c4SStefano Zampini #endif
2452*fcdce8c4SStefano Zampini   }
2453*fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2454*fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2455*fcdce8c4SStefano Zampini   /* precompute flops count */
2456*fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2457*fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2458*fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2459*fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2460*fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2461*fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2462*fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2463*fcdce8c4SStefano Zampini       }
2464*fcdce8c4SStefano Zampini     }
2465*fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2466*fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2467*fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2468*fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2469*fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2470*fcdce8c4SStefano Zampini     }
2471*fcdce8c4SStefano Zampini   } else { /* TODO */
2472*fcdce8c4SStefano Zampini     flops = 0.;
2473*fcdce8c4SStefano Zampini   }
2474*fcdce8c4SStefano Zampini 
2475*fcdce8c4SStefano Zampini   mmdata->flops = flops;
2476*fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2477*fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2478*fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2479*fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2480*fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2481*fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2482*fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2483*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2484*fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2485*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2486*fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2487*fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2488*fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2489*fcdce8c4SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUSPARSE(stat);
2490*fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2491*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2492*fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2493*fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2494*fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2495*fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2496*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2497*fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2498*fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2499*fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2500*fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2501*fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2502*fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2503*fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2504*fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2505*fcdce8c4SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUSPARSE(stat);
2506*fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2507*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2508*fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2509*fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2510*fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2511*fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2512*fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2513*fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
2514*fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2515*fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2516*fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2517*fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2518*fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2519*fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2520*fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2521*fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2522*fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2523*fcdce8c4SStefano Zampini #else
2524*fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2525*fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2526*fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2527*fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2528*fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2529*fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2530*fcdce8c4SStefano Zampini   c->nz = cnz;
2531*fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2532*fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2533*fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2534*fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2535*fcdce8c4SStefano Zampini 
2536*fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2537*fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2538*fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2539*fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2540*fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2541*fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2542*fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2543*fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2544*fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2545*fcdce8c4SStefano Zampini #endif
2546*fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2547*fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2548*fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2549*fcdce8c4SStefano Zampini finalizesym:
2550*fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2551*fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2552*fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2553*fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2554*fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2555*fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2556*fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2557*fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2558*fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2559*fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2560*fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2561*fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2562*fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2563*fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2564*fcdce8c4SStefano Zampini   } else {
2565*fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2566*fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2567*fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2568*fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2569*fcdce8c4SStefano Zampini   }
2570*fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2571*fcdce8c4SStefano Zampini     PetscInt r = 0;
2572*fcdce8c4SStefano Zampini     c->i[0] = 0;
2573*fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2574*fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2575*fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2576*fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2577*fcdce8c4SStefano Zampini     }
2578*fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2579*fcdce8c4SStefano Zampini   }
2580*fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2581*fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2582*fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2583*fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2584*fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2585*fcdce8c4SStefano Zampini   c->rmax = 0;
2586*fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2587*fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2588*fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2589*fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2590*fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2591*fcdce8c4SStefano Zampini   }
2592*fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2593*fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2594*fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2595*fcdce8c4SStefano Zampini 
2596*fcdce8c4SStefano Zampini   C->nonzerostate++;
2597*fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2598*fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2599*fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2600*fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2601*fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2602*fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2603*fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2604*fcdce8c4SStefano Zampini   if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2605*fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2606*fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2607*fcdce8c4SStefano Zampini   }
2608*fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2609*fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2610*fcdce8c4SStefano Zampini }
2611*fcdce8c4SStefano Zampini 
2612*fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2613*fcdce8c4SStefano Zampini 
2614*fcdce8c4SStefano Zampini /* handles sparse or dense B */
2615*fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2616*fcdce8c4SStefano Zampini {
2617*fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2618*fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2619*fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2620*fcdce8c4SStefano Zampini 
2621*fcdce8c4SStefano Zampini   PetscFunctionBegin;
2622*fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
2623*fcdce8c4SStefano Zampini   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2624*fcdce8c4SStefano Zampini   if (!product->B->boundtocpu) {
2625*fcdce8c4SStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2626*fcdce8c4SStefano Zampini   }
2627*fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2628*fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2629*fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
2630*fcdce8c4SStefano Zampini       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2631*fcdce8c4SStefano Zampini     }
2632*fcdce8c4SStefano Zampini   }
2633*fcdce8c4SStefano Zampini   if (isdense) {
2634ccdfe979SStefano Zampini     switch (product->type) {
2635ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2636ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2637ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2638ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2639ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2640*fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
2641*fcdce8c4SStefano Zampini         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2642*fcdce8c4SStefano Zampini       } else {
2643*fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2644*fcdce8c4SStefano Zampini       }
2645*fcdce8c4SStefano Zampini       break;
2646*fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2647*fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2648*fcdce8c4SStefano Zampini       break;
2649ccdfe979SStefano Zampini     default:
2650ccdfe979SStefano Zampini       break;
2651ccdfe979SStefano Zampini     }
2652*fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2653*fcdce8c4SStefano Zampini     switch (product->type) {
2654*fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2655*fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2656*fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2657*fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2658*fcdce8c4SStefano Zampini       break;
2659*fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2660*fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2661*fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2662*fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2663*fcdce8c4SStefano Zampini       break;
2664*fcdce8c4SStefano Zampini     default:
2665*fcdce8c4SStefano Zampini       break;
2666*fcdce8c4SStefano Zampini     }
2667*fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
2668*fcdce8c4SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2669*fcdce8c4SStefano Zampini   }
2670ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2671ccdfe979SStefano Zampini }
2672ccdfe979SStefano Zampini 
26736fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
26749ae82921SPaul Mullowney {
2675b175d8bbSPaul Mullowney   PetscErrorCode ierr;
26769ae82921SPaul Mullowney 
26779ae82921SPaul Mullowney   PetscFunctionBegin;
2678e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2679e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2680e6e9a74fSStefano Zampini }
2681e6e9a74fSStefano Zampini 
2682e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2683e6e9a74fSStefano Zampini {
2684e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2685e6e9a74fSStefano Zampini 
2686e6e9a74fSStefano Zampini   PetscFunctionBegin;
2687e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2688e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2689e6e9a74fSStefano Zampini }
2690e6e9a74fSStefano Zampini 
2691e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2692e6e9a74fSStefano Zampini {
2693e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2694e6e9a74fSStefano Zampini 
2695e6e9a74fSStefano Zampini   PetscFunctionBegin;
2696e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2697e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2698e6e9a74fSStefano Zampini }
2699e6e9a74fSStefano Zampini 
2700e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2701e6e9a74fSStefano Zampini {
2702e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2703e6e9a74fSStefano Zampini 
2704e6e9a74fSStefano Zampini   PetscFunctionBegin;
2705e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
27069ae82921SPaul Mullowney   PetscFunctionReturn(0);
27079ae82921SPaul Mullowney }
27089ae82921SPaul Mullowney 
27096fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2710ca45077fSPaul Mullowney {
2711b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2712ca45077fSPaul Mullowney 
2713ca45077fSPaul Mullowney   PetscFunctionBegin;
2714e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2715ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2716ca45077fSPaul Mullowney }
2717ca45077fSPaul Mullowney 
2718afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2719e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27209ae82921SPaul Mullowney {
27219ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2722aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27239ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2724e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2725b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
272657d48284SJunchao Zhang   cudaError_t                  cerr;
2727aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2728e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2729e6e9a74fSStefano Zampini   PetscBool                    compressed;
2730afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2731afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2732afb2bd1cSJunchao Zhang #endif
27336e111a19SKarl Rupp 
27349ae82921SPaul Mullowney   PetscFunctionBegin;
2735e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2736e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2737afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2738d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2739e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2740e6e9a74fSStefano Zampini   }
274134d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
274234d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2743e6e9a74fSStefano Zampini   if (!trans) {
27449ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2745c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2746e6e9a74fSStefano Zampini   } else {
2747e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2748e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2749e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2750e6e9a74fSStefano Zampini     } else {
2751afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2752e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2753e6e9a74fSStefano Zampini     }
2754e6e9a74fSStefano Zampini   }
2755e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2756e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2757213423ffSJunchao Zhang 
2758e6e9a74fSStefano Zampini   try {
2759e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2760213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2761213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2762afb2bd1cSJunchao Zhang 
276385ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2764e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2765afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2766afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2767afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2768afb2bd1cSJunchao Zhang       */
2769e6e9a74fSStefano Zampini       xptr = xarray;
2770afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2771213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2772afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2773afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2774afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2775afb2bd1cSJunchao Zhang        */
2776afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2777afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2778afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2779afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2780afb2bd1cSJunchao Zhang       }
2781afb2bd1cSJunchao Zhang      #endif
2782e6e9a74fSStefano Zampini     } else {
2783afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2784afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2785afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2786afb2bd1cSJunchao Zhang        */
2787afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2788e6e9a74fSStefano Zampini       dptr = zarray;
2789e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2790afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2791e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2792e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2793e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2794e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2795e6e9a74fSStefano Zampini       }
2796afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2797afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2798afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2799afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2800afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2801afb2bd1cSJunchao Zhang       }
2802afb2bd1cSJunchao Zhang      #endif
2803e6e9a74fSStefano Zampini     }
28049ae82921SPaul Mullowney 
2805afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2806aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2807afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2808afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2809afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2810afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2811afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2812afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2813afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2814afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2815afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2816afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2817afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2818afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2819afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2820afb2bd1cSJunchao Zhang 
2821afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2822afb2bd1cSJunchao Zhang       } else {
2823afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2824afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2825afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2826afb2bd1cSJunchao Zhang       }
2827afb2bd1cSJunchao Zhang 
2828afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2829afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2830afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2831afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2832afb2bd1cSJunchao Zhang                                beta,
2833afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2834afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2835afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2836afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2837afb2bd1cSJunchao Zhang      #else
28387656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2839e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2840a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2841afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2842aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2843e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
284457d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2845afb2bd1cSJunchao Zhang      #endif
2846aa372e3fSPaul Mullowney     } else {
2847213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2848afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2849afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2850afb2bd1cSJunchao Zhang        #else
2851301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2852e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2853afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2854e6e9a74fSStefano Zampini                                  xptr, beta,
285557d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2856afb2bd1cSJunchao Zhang        #endif
2857a65300a6SPaul Mullowney       }
2858aa372e3fSPaul Mullowney     }
285905035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2860958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2861aa372e3fSPaul Mullowney 
2862e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2863213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2864213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2865213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2866e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2867213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
28687656d835SStefano Zampini         }
2869213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2870c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
28717656d835SStefano Zampini       }
28727656d835SStefano Zampini 
2873213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2874213423ffSJunchao Zhang       if (compressed) {
2875213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2876e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2877c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2878e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2879c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
288005035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2881958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2882e6e9a74fSStefano Zampini       }
2883e6e9a74fSStefano Zampini     } else {
2884e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2885e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2886e6e9a74fSStefano Zampini       }
2887e6e9a74fSStefano Zampini     }
2888e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2889213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2890213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
28919ae82921SPaul Mullowney   } catch(char *ex) {
28929ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
28939ae82921SPaul Mullowney   }
2894e6e9a74fSStefano Zampini   if (yy) {
2895958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2896e6e9a74fSStefano Zampini   } else {
2897e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2898e6e9a74fSStefano Zampini   }
28999ae82921SPaul Mullowney   PetscFunctionReturn(0);
29009ae82921SPaul Mullowney }
29019ae82921SPaul Mullowney 
29026fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2903ca45077fSPaul Mullowney {
2904b175d8bbSPaul Mullowney   PetscErrorCode ierr;
29056e111a19SKarl Rupp 
2906ca45077fSPaul Mullowney   PetscFunctionBegin;
2907e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2908ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2909ca45077fSPaul Mullowney }
2910ca45077fSPaul Mullowney 
29116fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
29129ae82921SPaul Mullowney {
29139ae82921SPaul Mullowney   PetscErrorCode              ierr;
2914a587d139SMark   PetscSplitCSRDataStructure  *d_mat = NULL;
29159ae82921SPaul Mullowney   PetscFunctionBegin;
2916bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
29173fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2918bc3f50f2SPaul Mullowney   }
29193fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
29203fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2921a587d139SMark   if (d_mat) {
29223fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
29233fa6b06aSMark Adams   }
29243fa6b06aSMark Adams 
29259ae82921SPaul Mullowney   PetscFunctionReturn(0);
29269ae82921SPaul Mullowney }
29279ae82921SPaul Mullowney 
29289ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2929e057df02SPaul Mullowney /*@
29309ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2931e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
2932e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2933e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2934e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2935e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
29369ae82921SPaul Mullowney 
2937d083f849SBarry Smith    Collective
29389ae82921SPaul Mullowney 
29399ae82921SPaul Mullowney    Input Parameters:
29409ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
29419ae82921SPaul Mullowney .  m - number of rows
29429ae82921SPaul Mullowney .  n - number of columns
29439ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
29449ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
29450298fd71SBarry Smith          (possibly different for each row) or NULL
29469ae82921SPaul Mullowney 
29479ae82921SPaul Mullowney    Output Parameter:
29489ae82921SPaul Mullowney .  A - the matrix
29499ae82921SPaul Mullowney 
29509ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
29519ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
29529ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
29539ae82921SPaul Mullowney 
29549ae82921SPaul Mullowney    Notes:
29559ae82921SPaul Mullowney    If nnz is given then nz is ignored
29569ae82921SPaul Mullowney 
29579ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
29589ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
29599ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
29609ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
29619ae82921SPaul Mullowney 
29629ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
29630298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
29649ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
29659ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
29669ae82921SPaul Mullowney 
29679ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
29689ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
29699ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
29709ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
29719ae82921SPaul Mullowney 
29729ae82921SPaul Mullowney    Level: intermediate
29739ae82921SPaul Mullowney 
2974e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
29759ae82921SPaul Mullowney @*/
29769ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
29779ae82921SPaul Mullowney {
29789ae82921SPaul Mullowney   PetscErrorCode ierr;
29799ae82921SPaul Mullowney 
29809ae82921SPaul Mullowney   PetscFunctionBegin;
29819ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
29829ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
29839ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
29849ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
29859ae82921SPaul Mullowney   PetscFunctionReturn(0);
29869ae82921SPaul Mullowney }
29879ae82921SPaul Mullowney 
29886fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
29899ae82921SPaul Mullowney {
29909ae82921SPaul Mullowney   PetscErrorCode              ierr;
29913fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL;
2992ab25e6cbSDominic Meiser 
29939ae82921SPaul Mullowney   PetscFunctionBegin;
29949ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
29953fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
29963fa6b06aSMark Adams     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2997470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
29989ae82921SPaul Mullowney   } else {
2999470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3000aa372e3fSPaul Mullowney   }
30013fa6b06aSMark Adams   if (d_mat) {
30023fa6b06aSMark Adams     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
30033fa6b06aSMark Adams     cudaError_t                err;
30043fa6b06aSMark Adams     PetscSplitCSRDataStructure h_mat;
30053fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
30063fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
30073fa6b06aSMark Adams     if (a->compressedrow.use) {
30083fa6b06aSMark Adams       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
30093fa6b06aSMark Adams     }
30103fa6b06aSMark Adams     err = cudaFree(d_mat);CHKERRCUDA(err);
30113fa6b06aSMark Adams   }
3012ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3013ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3014ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3015*fcdce8c4SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3016ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
30177e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
30187e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
30199ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
30209ae82921SPaul Mullowney   PetscFunctionReturn(0);
30219ae82921SPaul Mullowney }
30229ae82921SPaul Mullowney 
3023ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
302495639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
30259ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
30269ff858a8SKarl Rupp {
30279ff858a8SKarl Rupp   PetscErrorCode ierr;
30289ff858a8SKarl Rupp 
30299ff858a8SKarl Rupp   PetscFunctionBegin;
30309ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3031ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
30329ff858a8SKarl Rupp   PetscFunctionReturn(0);
30339ff858a8SKarl Rupp }
30349ff858a8SKarl Rupp 
3035a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc.
303695639643SRichard Tran Mills {
3037e6e9a74fSStefano Zampini   PetscErrorCode ierr;
3038a587d139SMark   Mat_SeqAIJ     *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3039a587d139SMark   PetscBool      flgx,flgy;
3040e6e9a74fSStefano Zampini 
304195639643SRichard Tran Mills   PetscFunctionBegin;
3042a587d139SMark   if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
3043a587d139SMark   PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
3044a587d139SMark   PetscValidHeaderSpecific(X,MAT_CLASSID,3);
3045a587d139SMark   ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
3046a587d139SMark   ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
3047a587d139SMark   if (!flgx || !flgy) {
3048a587d139SMark     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
3049a587d139SMark     PetscFunctionReturn(0);
305095639643SRichard Tran Mills   }
3051a587d139SMark   if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
3052a587d139SMark   if (str == DIFFERENT_NONZERO_PATTERN) {
3053a587d139SMark     if (x->nz == y->nz) {
3054a587d139SMark       PetscBool e;
3055a587d139SMark       ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
3056a587d139SMark       if (e) {
3057a587d139SMark         ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
3058a587d139SMark         if (e) {
3059a587d139SMark           str = SAME_NONZERO_PATTERN;
3060a587d139SMark         }
3061a587d139SMark       }
3062a587d139SMark     }
3063a587d139SMark   }
3064a587d139SMark   if (str != SAME_NONZERO_PATTERN) {
3065a587d139SMark     ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr);
3066a587d139SMark     PetscFunctionReturn(0);
3067a587d139SMark   } else {
3068a587d139SMark     Mat_SeqAIJCUSPARSE           *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3069a587d139SMark     Mat_SeqAIJCUSPARSE           *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;
3070a587d139SMark     if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3071a587d139SMark     if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3072a587d139SMark     if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
3073a587d139SMark       if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
3074a587d139SMark         ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
3075a587d139SMark       }
3076a587d139SMark       if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
3077a587d139SMark         ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
3078a587d139SMark       }
3079a587d139SMark       ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
3080a587d139SMark       ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3081a587d139SMark       ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3082a587d139SMark     } else {
3083a587d139SMark       cublasHandle_t cublasv2handle;
3084a587d139SMark       cublasStatus_t cberr;
3085a587d139SMark       cudaError_t    err;
3086a587d139SMark       PetscScalar    alpha = a;
3087a587d139SMark       PetscBLASInt   one = 1, bnz = 1;
3088a587d139SMark       CsrMatrix      *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
3089a587d139SMark       CsrMatrix      *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
3090a587d139SMark       PetscScalar    *aa_y, *aa_x;
3091a587d139SMark       aa_y = matrix_y->values->data().get();
3092a587d139SMark       aa_x = matrix_x->values->data().get();
3093a587d139SMark       ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3094a587d139SMark       ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3095a587d139SMark       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3096a587d139SMark       cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
3097a587d139SMark       err  = WaitForCUDA();CHKERRCUDA(err);
3098a587d139SMark       ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3099a587d139SMark       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3100a587d139SMark       ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3101a587d139SMark       ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
3102a587d139SMark       if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU;
3103a587d139SMark       else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
3104a587d139SMark       ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
3105a587d139SMark     }
3106a587d139SMark   }
310795639643SRichard Tran Mills   PetscFunctionReturn(0);
310895639643SRichard Tran Mills }
310995639643SRichard Tran Mills 
31103fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
31113fa6b06aSMark Adams {
31123fa6b06aSMark Adams   PetscErrorCode             ierr;
31137e8381f9SStefano Zampini   PetscBool                  both = PETSC_FALSE;
3114a587d139SMark   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
31157e8381f9SStefano Zampini 
31163fa6b06aSMark Adams   PetscFunctionBegin;
31173fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
31183fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
31197e8381f9SStefano Zampini     if (spptr->mat) {
31207e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
31217e8381f9SStefano Zampini       if (matrix->values) {
31227e8381f9SStefano Zampini         both = PETSC_TRUE;
31237e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
31247e8381f9SStefano Zampini       }
31257e8381f9SStefano Zampini     }
31267e8381f9SStefano Zampini     if (spptr->matTranspose) {
31277e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
31287e8381f9SStefano Zampini       if (matrix->values) {
31297e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
31307e8381f9SStefano Zampini       }
31317e8381f9SStefano Zampini     }
31323fa6b06aSMark Adams   }
3133a587d139SMark   //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
3134a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3135a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
31367e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3137a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
31383fa6b06aSMark Adams 
31393fa6b06aSMark Adams   PetscFunctionReturn(0);
31403fa6b06aSMark Adams }
31413fa6b06aSMark Adams 
3142a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3143a587d139SMark {
3144a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3145a587d139SMark   PetscErrorCode ierr;
3146a587d139SMark 
3147a587d139SMark   PetscFunctionBegin;
3148a587d139SMark   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
3149a587d139SMark   if (flg) {
3150a587d139SMark     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3151a587d139SMark 
3152a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3153a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3154a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3155a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3156a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3157a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3158a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3159a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3160*fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3161a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3162a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3163a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3164a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3165a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3166*fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3167a587d139SMark   } else {
3168a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3169a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3170a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3171a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3172a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3173a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3174a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3175a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3176*fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3177a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3178a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3179a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3180a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3181a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3182*fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3183a587d139SMark   }
3184a587d139SMark   A->boundtocpu = flg;
3185a587d139SMark   a->inode.use = flg;
3186a587d139SMark   PetscFunctionReturn(0);
3187a587d139SMark }
3188a587d139SMark 
318949735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
31909ae82921SPaul Mullowney {
31919ae82921SPaul Mullowney   PetscErrorCode   ierr;
3192aa372e3fSPaul Mullowney   cusparseStatus_t stat;
319349735bf3SStefano Zampini   Mat              B;
31949ae82921SPaul Mullowney 
31959ae82921SPaul Mullowney   PetscFunctionBegin;
3196832b2c02SStefano Zampini   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
319749735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
319849735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
319949735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
320049735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
320149735bf3SStefano Zampini   }
320249735bf3SStefano Zampini   B = *newmat;
320349735bf3SStefano Zampini 
320434136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
320534136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
320634136279SStefano Zampini 
320749735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
32089ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3209e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
3210e6e9a74fSStefano Zampini 
3211e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3212e6e9a74fSStefano Zampini       spptr->format = MAT_CUSPARSE_CSR;
3213e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3214e6e9a74fSStefano Zampini       B->spptr = spptr;
32153fa6b06aSMark Adams       spptr->deviceMat = NULL;
32169ae82921SPaul Mullowney     } else {
3217e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3218e6e9a74fSStefano Zampini 
3219e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3220e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3221e6e9a74fSStefano Zampini       B->spptr = spptr;
32229ae82921SPaul Mullowney     }
3223e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
322449735bf3SStefano Zampini   }
3225693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
32269ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
32279ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
322895639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3229693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
32302205254eSKarl Rupp 
3231e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
32329ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3233bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
32349ae82921SPaul Mullowney   PetscFunctionReturn(0);
32359ae82921SPaul Mullowney }
32369ae82921SPaul Mullowney 
323702fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
323802fe1965SBarry Smith {
323902fe1965SBarry Smith   PetscErrorCode ierr;
324002fe1965SBarry Smith 
324102fe1965SBarry Smith   PetscFunctionBegin;
324202fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
32430ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
3244afb2bd1cSJunchao Zhang   ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
3245afb2bd1cSJunchao Zhang   ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
3246afb2bd1cSJunchao Zhang   ierr = PetscOptionsEnd();CHKERRQ(ierr);
324702fe1965SBarry Smith   PetscFunctionReturn(0);
324802fe1965SBarry Smith }
324902fe1965SBarry Smith 
32503ca39a21SBarry Smith /*MC
3251e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3252e057df02SPaul Mullowney 
3253e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
32542692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
32552692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3256e057df02SPaul Mullowney 
3257e057df02SPaul Mullowney    Options Database Keys:
3258e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3259aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3260a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3261e057df02SPaul Mullowney 
3262e057df02SPaul Mullowney   Level: beginner
3263e057df02SPaul Mullowney 
32648468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3265e057df02SPaul Mullowney M*/
32667f756511SDominic Meiser 
326742c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
326842c9c57cSBarry Smith 
32690f39cd5aSBarry Smith 
32703ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
327142c9c57cSBarry Smith {
327242c9c57cSBarry Smith   PetscErrorCode ierr;
327342c9c57cSBarry Smith 
327442c9c57cSBarry Smith   PetscFunctionBegin;
32753ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
32763ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
32773ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
32783ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
327942c9c57cSBarry Smith   PetscFunctionReturn(0);
328042c9c57cSBarry Smith }
328129b38603SBarry Smith 
3282470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
32837f756511SDominic Meiser {
3284e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
32857f756511SDominic Meiser   cusparseStatus_t stat;
32867f756511SDominic Meiser 
32877f756511SDominic Meiser   PetscFunctionBegin;
32887f756511SDominic Meiser   if (*cusparsestruct) {
3289e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3290e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
32917f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
329281902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
32937e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
32947e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
32957e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_v;
32967e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_w;
32977e8381f9SStefano Zampini     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3298afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3299afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
3300afb2bd1cSJunchao Zhang    #endif
3301e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
33027f756511SDominic Meiser   }
33037f756511SDominic Meiser   PetscFunctionReturn(0);
33047f756511SDominic Meiser }
33057f756511SDominic Meiser 
33067f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
33077f756511SDominic Meiser {
33087f756511SDominic Meiser   PetscFunctionBegin;
33097f756511SDominic Meiser   if (*mat) {
33107f756511SDominic Meiser     delete (*mat)->values;
33117f756511SDominic Meiser     delete (*mat)->column_indices;
33127f756511SDominic Meiser     delete (*mat)->row_offsets;
33137f756511SDominic Meiser     delete *mat;
33147f756511SDominic Meiser     *mat = 0;
33157f756511SDominic Meiser   }
33167f756511SDominic Meiser   PetscFunctionReturn(0);
33177f756511SDominic Meiser }
33187f756511SDominic Meiser 
3319470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
33207f756511SDominic Meiser {
33217f756511SDominic Meiser   cusparseStatus_t stat;
33227f756511SDominic Meiser   PetscErrorCode   ierr;
33237f756511SDominic Meiser 
33247f756511SDominic Meiser   PetscFunctionBegin;
33257f756511SDominic Meiser   if (*trifactor) {
332657d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3327afb2bd1cSJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
33287f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
33291b0a6780SStefano Zampini     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
33302cbc15d9SMark     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3331afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
33321b0a6780SStefano Zampini     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3333afb2bd1cSJunchao Zhang    #endif
3334da79fbbcSStefano Zampini     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
33357f756511SDominic Meiser   }
33367f756511SDominic Meiser   PetscFunctionReturn(0);
33377f756511SDominic Meiser }
33387f756511SDominic Meiser 
3339470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
33407f756511SDominic Meiser {
33417f756511SDominic Meiser   CsrMatrix        *mat;
33427f756511SDominic Meiser   cusparseStatus_t stat;
33437f756511SDominic Meiser   cudaError_t      err;
33447f756511SDominic Meiser 
33457f756511SDominic Meiser   PetscFunctionBegin;
33467f756511SDominic Meiser   if (*matstruct) {
33477f756511SDominic Meiser     if ((*matstruct)->mat) {
33487f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3349afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3350afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3351afb2bd1cSJunchao Zhang        #else
33527f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
335357d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3354afb2bd1cSJunchao Zhang        #endif
33557f756511SDominic Meiser       } else {
33567f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
33577f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
33587f756511SDominic Meiser       }
33597f756511SDominic Meiser     }
336057d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
33617f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
3362afb2bd1cSJunchao Zhang     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
33637656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
33647656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3365afb2bd1cSJunchao Zhang 
3366afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3367afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3368afb2bd1cSJunchao Zhang     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3369afb2bd1cSJunchao Zhang     for (int i=0; i<3; i++) {
3370afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
3371afb2bd1cSJunchao Zhang         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3372afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3373afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3374afb2bd1cSJunchao Zhang       }
3375afb2bd1cSJunchao Zhang     }
3376afb2bd1cSJunchao Zhang    #endif
33777f756511SDominic Meiser     delete *matstruct;
33787e8381f9SStefano Zampini     *matstruct = NULL;
33797f756511SDominic Meiser   }
33807f756511SDominic Meiser   PetscFunctionReturn(0);
33817f756511SDominic Meiser }
33827f756511SDominic Meiser 
3383ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
33847f756511SDominic Meiser {
3385e6e9a74fSStefano Zampini   PetscErrorCode ierr;
3386e6e9a74fSStefano Zampini 
33877f756511SDominic Meiser   PetscFunctionBegin;
33887f756511SDominic Meiser   if (*trifactors) {
3389e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3390e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3391e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3392e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
33937f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
33947f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
33957f756511SDominic Meiser     delete (*trifactors)->workVector;
33967e8381f9SStefano Zampini     (*trifactors)->rpermIndices = NULL;
33977e8381f9SStefano Zampini     (*trifactors)->cpermIndices = NULL;
33987e8381f9SStefano Zampini     (*trifactors)->workVector = NULL;
3399ccdfe979SStefano Zampini   }
3400ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3401ccdfe979SStefano Zampini }
3402ccdfe979SStefano Zampini 
3403ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3404ccdfe979SStefano Zampini {
3405e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
3406ccdfe979SStefano Zampini   cusparseHandle_t handle;
3407ccdfe979SStefano Zampini   cusparseStatus_t stat;
3408ccdfe979SStefano Zampini 
3409ccdfe979SStefano Zampini   PetscFunctionBegin;
3410ccdfe979SStefano Zampini   if (*trifactors) {
3411e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
34127f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
341357d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
34147f756511SDominic Meiser     }
3415e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
34167f756511SDominic Meiser   }
34177f756511SDominic Meiser   PetscFunctionReturn(0);
34187f756511SDominic Meiser }
34197e8381f9SStefano Zampini 
34207e8381f9SStefano Zampini struct IJCompare
34217e8381f9SStefano Zampini {
34227e8381f9SStefano Zampini   __host__ __device__
34237e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
34247e8381f9SStefano Zampini   {
34257e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
34267e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
34277e8381f9SStefano Zampini     return false;
34287e8381f9SStefano Zampini   }
34297e8381f9SStefano Zampini };
34307e8381f9SStefano Zampini 
34317e8381f9SStefano Zampini struct IJEqual
34327e8381f9SStefano Zampini {
34337e8381f9SStefano Zampini   __host__ __device__
34347e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
34357e8381f9SStefano Zampini   {
34367e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
34377e8381f9SStefano Zampini     return true;
34387e8381f9SStefano Zampini   }
34397e8381f9SStefano Zampini };
34407e8381f9SStefano Zampini 
34417e8381f9SStefano Zampini struct IJDiff
34427e8381f9SStefano Zampini {
34437e8381f9SStefano Zampini   __host__ __device__
34447e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
34457e8381f9SStefano Zampini   {
34467e8381f9SStefano Zampini     return t1 == t2 ? 0 : 1;
34477e8381f9SStefano Zampini   }
34487e8381f9SStefano Zampini };
34497e8381f9SStefano Zampini 
34507e8381f9SStefano Zampini struct IJSum
34517e8381f9SStefano Zampini {
34527e8381f9SStefano Zampini   __host__ __device__
34537e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
34547e8381f9SStefano Zampini   {
34557e8381f9SStefano Zampini     return t1||t2;
34567e8381f9SStefano Zampini   }
34577e8381f9SStefano Zampini };
34587e8381f9SStefano Zampini 
34597e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
34607e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
34617e8381f9SStefano Zampini {
34627e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3463*fcdce8c4SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
34647e8381f9SStefano Zampini   CsrMatrix          *matrix;
34657e8381f9SStefano Zampini   PetscErrorCode     ierr;
34667e8381f9SStefano Zampini   cudaError_t        cerr;
34677e8381f9SStefano Zampini   PetscInt           n;
34687e8381f9SStefano Zampini 
34697e8381f9SStefano Zampini   PetscFunctionBegin;
34707e8381f9SStefano Zampini   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
34717e8381f9SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
34727e8381f9SStefano Zampini   if (!cusp->cooPerm) {
34737e8381f9SStefano Zampini     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
34747e8381f9SStefano Zampini     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
34757e8381f9SStefano Zampini     PetscFunctionReturn(0);
34767e8381f9SStefano Zampini   }
34777e8381f9SStefano Zampini   n = cusp->cooPerm->size();
34787e8381f9SStefano Zampini   matrix = (CsrMatrix*)cusp->mat->mat;
34797e8381f9SStefano Zampini   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
34807e8381f9SStefano Zampini   if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
34817e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
34827e8381f9SStefano Zampini   if (v) {
34837e8381f9SStefano Zampini     cusp->cooPerm_v->assign(v,v+n);
34847e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
34857e8381f9SStefano Zampini   }
34867e8381f9SStefano Zampini   else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
34877e8381f9SStefano Zampini   if (imode == ADD_VALUES) {
34887e8381f9SStefano Zampini     if (cusp->cooPerm_a) {
34897e8381f9SStefano Zampini       if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
34907e8381f9SStefano Zampini       auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
34917e8381f9SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
34927e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
34937e8381f9SStefano Zampini     } else {
34947e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
34957e8381f9SStefano Zampini                                                                 matrix->values->begin()));
34967e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
34977e8381f9SStefano Zampini                                                                 matrix->values->end()));
34987e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
34997e8381f9SStefano Zampini     }
35007e8381f9SStefano Zampini   } else {
35017e8381f9SStefano Zampini     if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
35027e8381f9SStefano Zampini                               if we are inserting two different values into the same location */
35037e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
35047e8381f9SStefano Zampini                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
35057e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
35067e8381f9SStefano Zampini                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
35077e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
35087e8381f9SStefano Zampini     } else {
35097e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
35107e8381f9SStefano Zampini                                                                 matrix->values->begin()));
35117e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
35127e8381f9SStefano Zampini                                                                 matrix->values->end()));
35137e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
35147e8381f9SStefano Zampini     }
35157e8381f9SStefano Zampini   }
35167e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
35177e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
35187e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3519*fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
3520*fcdce8c4SStefano Zampini   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3521*fcdce8c4SStefano Zampini   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3522*fcdce8c4SStefano Zampini   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3523*fcdce8c4SStefano Zampini   a->reallocs         = 0;
3524*fcdce8c4SStefano Zampini   A->info.mallocs    += 0;
3525*fcdce8c4SStefano Zampini   A->info.nz_unneeded = 0;
3526*fcdce8c4SStefano Zampini   A->assembled = A->was_assembled = PETSC_TRUE;
3527*fcdce8c4SStefano Zampini   A->num_ass++;
35287e8381f9SStefano Zampini   /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
35297e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
35307e8381f9SStefano Zampini   PetscFunctionReturn(0);
35317e8381f9SStefano Zampini }
35327e8381f9SStefano Zampini 
35337e8381f9SStefano Zampini #include <thrust/binary_search.h>
35347e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
35357e8381f9SStefano Zampini {
35367e8381f9SStefano Zampini   PetscErrorCode     ierr;
35377e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
35387e8381f9SStefano Zampini   CsrMatrix          *matrix;
35397e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
35407e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
35417e8381f9SStefano Zampini   cudaError_t        cerr;
35427e8381f9SStefano Zampini 
35437e8381f9SStefano Zampini   PetscFunctionBegin;
35447e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
35457e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
35467e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
35477e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
35487e8381f9SStefano Zampini   if (n != cooPerm_n) {
35497e8381f9SStefano Zampini     delete cusp->cooPerm;
35507e8381f9SStefano Zampini     delete cusp->cooPerm_v;
35517e8381f9SStefano Zampini     delete cusp->cooPerm_w;
35527e8381f9SStefano Zampini     delete cusp->cooPerm_a;
35537e8381f9SStefano Zampini     cusp->cooPerm = NULL;
35547e8381f9SStefano Zampini     cusp->cooPerm_v = NULL;
35557e8381f9SStefano Zampini     cusp->cooPerm_w = NULL;
35567e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
35577e8381f9SStefano Zampini   }
35587e8381f9SStefano Zampini   if (n) {
35597e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
35607e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
35617e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
35627e8381f9SStefano Zampini 
35637e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
35647e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
35657e8381f9SStefano Zampini 
35667e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
35677e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
35687e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
35697e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
35707e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
35717e8381f9SStefano Zampini 
35727e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
35737e8381f9SStefano Zampini     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
35747e8381f9SStefano Zampini     *cusp->cooPerm_a = d_i;
35757e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
35767e8381f9SStefano Zampini 
35777e8381f9SStefano Zampini     auto nekey = thrust::unique(fkey, ekey, IJEqual());
35787e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
35797e8381f9SStefano Zampini       delete cusp->cooPerm_a;
35807e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
35817e8381f9SStefano Zampini     } else { /* I couldn't come up with a more elegant algorithm */
35827e8381f9SStefano Zampini       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
35837e8381f9SStefano Zampini       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
35847e8381f9SStefano Zampini       (*cusp->cooPerm_a)[0] = 0;
35857e8381f9SStefano Zampini       w[0] = 0;
35867e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
35877e8381f9SStefano Zampini       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
35887e8381f9SStefano Zampini     }
35897e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
35907e8381f9SStefano Zampini     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
35917e8381f9SStefano Zampini                         search_begin, search_begin + A->rmap->n,
35927e8381f9SStefano Zampini                         ii.begin());
35937e8381f9SStefano Zampini 
35947e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
35957e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
35967e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
35977e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
35987e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
35997e8381f9SStefano Zampini     a->i[0] = 0;
36007e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
36017e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
3602*fcdce8c4SStefano Zampini     a->rmax = 0;
36037e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
36047e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
36057e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
36067e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
36077e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
36087e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
36097e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
36107e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
36117e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
3612*fcdce8c4SStefano Zampini       a->rmax = PetscMax(a->rmax,nnzr);
36137e8381f9SStefano Zampini     }
3614*fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
36157e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
36167e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3617*fcdce8c4SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
36187e8381f9SStefano Zampini   } else {
36197e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
36207e8381f9SStefano Zampini   }
36217e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
36227e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
36237e8381f9SStefano Zampini 
36247e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
36257e8381f9SStefano Zampini      The code is so convoluted now that I prefer to copy garbage to the GPU */
36267e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
36277e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
36287e8381f9SStefano Zampini   A->nonzerostate++;
36297e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
36307e8381f9SStefano Zampini   {
36317e8381f9SStefano Zampini     matrix = (CsrMatrix*)cusp->mat->mat;
36327e8381f9SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
36337e8381f9SStefano Zampini     thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
36347e8381f9SStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
36357e8381f9SStefano Zampini   }
36367e8381f9SStefano Zampini 
36377e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
36387e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
36397e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
36407e8381f9SStefano Zampini   PetscFunctionReturn(0);
36417e8381f9SStefano Zampini }
3642