xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 039c6fba3b07bebff1e830e82a42905dc1e47ea3)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
686fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
759ae82921SPaul Mullowney 
767f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
827f756511SDominic Meiser 
8357181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
8557181aedSStefano Zampini 
867e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
887e8381f9SStefano Zampini 
89b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
90b06137fdSPaul Mullowney {
91b06137fdSPaul Mullowney   cusparseStatus_t   stat;
92b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
93b06137fdSPaul Mullowney 
94b06137fdSPaul Mullowney   PetscFunctionBegin;
95d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
96b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
9757d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
98b06137fdSPaul Mullowney   PetscFunctionReturn(0);
99b06137fdSPaul Mullowney }
100b06137fdSPaul Mullowney 
101b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
102b06137fdSPaul Mullowney {
103b06137fdSPaul Mullowney   cusparseStatus_t   stat;
104b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
105b06137fdSPaul Mullowney 
106b06137fdSPaul Mullowney   PetscFunctionBegin;
107d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1086b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
10916a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11057d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11116a2e217SAlejandro Lamas Daviña     }
112b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1136b1cf21dSAlejandro Lamas Daviña   }
11457d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
115b06137fdSPaul Mullowney   PetscFunctionReturn(0);
116b06137fdSPaul Mullowney }
117b06137fdSPaul Mullowney 
118b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
119b06137fdSPaul Mullowney {
120b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1217e8381f9SStefano Zampini   PetscBool          flg;
1227e8381f9SStefano Zampini   PetscErrorCode     ierr;
123ccdfe979SStefano Zampini 
124b06137fdSPaul Mullowney   PetscFunctionBegin;
1257e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1267e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
127ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
128b06137fdSPaul Mullowney   PetscFunctionReturn(0);
129b06137fdSPaul Mullowney }
130b06137fdSPaul Mullowney 
131ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1329ae82921SPaul Mullowney {
1339ae82921SPaul Mullowney   PetscFunctionBegin;
1349ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1359ae82921SPaul Mullowney   PetscFunctionReturn(0);
1369ae82921SPaul Mullowney }
1379ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1509ae82921SPaul Mullowney 
15142c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1529ae82921SPaul Mullowney {
1539ae82921SPaul Mullowney   PetscErrorCode ierr;
154bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1559ae82921SPaul Mullowney 
1569ae82921SPaul Mullowney   PetscFunctionBegin;
157bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
158bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1592c7c0729SBarry Smith   (*B)->factortype = ftype;
1602c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1619ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1622205254eSKarl Rupp 
163087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16433d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1659ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1669ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
167087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
168087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
169087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1709ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
171bc3f50f2SPaul Mullowney 
172fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1733ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1749ae82921SPaul Mullowney   PetscFunctionReturn(0);
1759ae82921SPaul Mullowney }
1769ae82921SPaul Mullowney 
177bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
178ca45077fSPaul Mullowney {
179aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1806e111a19SKarl Rupp 
181ca45077fSPaul Mullowney   PetscFunctionBegin;
182ca45077fSPaul Mullowney   switch (op) {
183e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
184aa372e3fSPaul Mullowney     cusparsestruct->format = format;
185ca45077fSPaul Mullowney     break;
186e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
187aa372e3fSPaul Mullowney     cusparsestruct->format = format;
188ca45077fSPaul Mullowney     break;
189ca45077fSPaul Mullowney   default:
19036d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
191ca45077fSPaul Mullowney   }
192ca45077fSPaul Mullowney   PetscFunctionReturn(0);
193ca45077fSPaul Mullowney }
1949ae82921SPaul Mullowney 
195e057df02SPaul Mullowney /*@
196e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
197e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
198aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
199e057df02SPaul Mullowney    Not Collective
200e057df02SPaul Mullowney 
201e057df02SPaul Mullowney    Input Parameters:
2028468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
20336d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2042692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
205e057df02SPaul Mullowney 
206e057df02SPaul Mullowney    Output Parameter:
207e057df02SPaul Mullowney 
208e057df02SPaul Mullowney    Level: intermediate
209e057df02SPaul Mullowney 
2108468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
211e057df02SPaul Mullowney @*/
212e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
213e057df02SPaul Mullowney {
214e057df02SPaul Mullowney   PetscErrorCode ierr;
2156e111a19SKarl Rupp 
216e057df02SPaul Mullowney   PetscFunctionBegin;
217e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
218e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
219e057df02SPaul Mullowney   PetscFunctionReturn(0);
220e057df02SPaul Mullowney }
221e057df02SPaul Mullowney 
222e6e9a74fSStefano Zampini /*@
223e589036eSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
224e6e9a74fSStefano Zampini 
225e6e9a74fSStefano Zampini    Collective on mat
226e6e9a74fSStefano Zampini 
227e6e9a74fSStefano Zampini    Input Parameters:
228e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
229e6e9a74fSStefano Zampini -  transgen - the boolean flag
230e6e9a74fSStefano Zampini 
231e6e9a74fSStefano Zampini    Level: intermediate
232e6e9a74fSStefano Zampini 
233e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
234e6e9a74fSStefano Zampini @*/
235e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
236e6e9a74fSStefano Zampini {
237e6e9a74fSStefano Zampini   PetscErrorCode ierr;
238e6e9a74fSStefano Zampini   PetscBool      flg;
239e6e9a74fSStefano Zampini 
240e6e9a74fSStefano Zampini   PetscFunctionBegin;
241e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
242e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
243e6e9a74fSStefano Zampini   if (flg) {
244e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
24554da937aSStefano Zampini 
246e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
247e6e9a74fSStefano Zampini     cusp->transgen = transgen;
24854da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
24954da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
25054da937aSStefano Zampini     }
251e6e9a74fSStefano Zampini   }
252e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
253e6e9a74fSStefano Zampini }
254e6e9a74fSStefano Zampini 
2554416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2569ae82921SPaul Mullowney {
2579ae82921SPaul Mullowney   PetscErrorCode           ierr;
258e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2599ae82921SPaul Mullowney   PetscBool                flg;
260a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2616e111a19SKarl Rupp 
2629ae82921SPaul Mullowney   PetscFunctionBegin;
263e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2649ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
26554da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
26654da937aSStefano Zampini 
26754da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
268afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
269afb2bd1cSJunchao Zhang 
270e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
271a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
272afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
273afb2bd1cSJunchao Zhang 
2744c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
275a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
276afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
277afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
278afb2bd1cSJunchao Zhang     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
279afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
280afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
281afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
282afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
283afb2bd1cSJunchao Zhang 
284afb2bd1cSJunchao Zhang     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
285afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
286afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
287afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
288afb2bd1cSJunchao Zhang 
289afb2bd1cSJunchao Zhang     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
290afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
291afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
292afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
293afb2bd1cSJunchao Zhang    #endif
2944c87dfd4SPaul Mullowney   }
2950af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2969ae82921SPaul Mullowney   PetscFunctionReturn(0);
2979ae82921SPaul Mullowney }
2989ae82921SPaul Mullowney 
2996fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3009ae82921SPaul Mullowney {
301da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3029ae82921SPaul Mullowney   PetscErrorCode               ierr;
3039ae82921SPaul Mullowney 
3049ae82921SPaul Mullowney   PetscFunctionBegin;
305da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3069ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3079ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3089ae82921SPaul Mullowney   PetscFunctionReturn(0);
3099ae82921SPaul Mullowney }
3109ae82921SPaul Mullowney 
3116fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3129ae82921SPaul Mullowney {
313da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3149ae82921SPaul Mullowney   PetscErrorCode               ierr;
3159ae82921SPaul Mullowney 
3169ae82921SPaul Mullowney   PetscFunctionBegin;
317da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3189ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3199ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3209ae82921SPaul Mullowney   PetscFunctionReturn(0);
3219ae82921SPaul Mullowney }
3229ae82921SPaul Mullowney 
323087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
324087f3262SPaul Mullowney {
325da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
326087f3262SPaul Mullowney   PetscErrorCode               ierr;
327087f3262SPaul Mullowney 
328087f3262SPaul Mullowney   PetscFunctionBegin;
329da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
330087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
331087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
332087f3262SPaul Mullowney   PetscFunctionReturn(0);
333087f3262SPaul Mullowney }
334087f3262SPaul Mullowney 
335087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
336087f3262SPaul Mullowney {
337da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
338087f3262SPaul Mullowney   PetscErrorCode               ierr;
339087f3262SPaul Mullowney 
340087f3262SPaul Mullowney   PetscFunctionBegin;
341da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
342087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
343087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
344087f3262SPaul Mullowney   PetscFunctionReturn(0);
345087f3262SPaul Mullowney }
346087f3262SPaul Mullowney 
347087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3489ae82921SPaul Mullowney {
3499ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3509ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3519ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
352aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3539ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3549ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3559ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3569ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3579ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
358b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
35957d48284SJunchao Zhang   cudaError_t                       cerr;
3609ae82921SPaul Mullowney 
3619ae82921SPaul Mullowney   PetscFunctionBegin;
362cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
363c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3649ae82921SPaul Mullowney     try {
3659ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3669ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
367da79fbbcSStefano Zampini       if (!loTriFactor) {
3682cbc15d9SMark         PetscScalar                       *AALo;
3692cbc15d9SMark 
3702cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3719ae82921SPaul Mullowney 
3729ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
37357d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37457d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3759ae82921SPaul Mullowney 
3769ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3779ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3789ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3799ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3809ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3819ae82921SPaul Mullowney         v        = aa;
3829ae82921SPaul Mullowney         vi       = aj;
3839ae82921SPaul Mullowney         offset   = 1;
3849ae82921SPaul Mullowney         rowOffset= 1;
3859ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3869ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
387e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3889ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3899ae82921SPaul Mullowney           rowOffset += nz+1;
3909ae82921SPaul Mullowney 
391580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
392580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3939ae82921SPaul Mullowney 
3949ae82921SPaul Mullowney           offset      += nz;
3959ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3969ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3979ae82921SPaul Mullowney           offset      += 1;
3989ae82921SPaul Mullowney 
3999ae82921SPaul Mullowney           v  += nz;
4009ae82921SPaul Mullowney           vi += nz;
4019ae82921SPaul Mullowney         }
4022205254eSKarl Rupp 
403aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
404da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
405da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
406aa372e3fSPaul Mullowney         /* Create the matrix description */
40757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
40857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4091b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
410afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
411afb2bd1cSJunchao Zhang        #else
41257d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
413afb2bd1cSJunchao Zhang        #endif
41457d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41557d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
416aa372e3fSPaul Mullowney 
417aa372e3fSPaul Mullowney         /* set the operation */
418aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
419aa372e3fSPaul Mullowney 
420aa372e3fSPaul Mullowney         /* set the matrix */
421aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
422aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
423aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
424aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
425aa372e3fSPaul Mullowney 
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
427aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
428aa372e3fSPaul Mullowney 
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
430aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
431aa372e3fSPaul Mullowney 
432aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
433aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
434aa372e3fSPaul Mullowney 
435afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
436da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
437afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4381b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
439afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
440afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
441afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
442afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
443afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
444afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
445afb2bd1cSJunchao Zhang       #endif
446afb2bd1cSJunchao Zhang 
447aa372e3fSPaul Mullowney         /* perform the solve analysis */
448aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
449aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
450aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
451afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4521b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
453afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
454afb2bd1cSJunchao Zhang                                #endif
455afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
456da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
457da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
458aa372e3fSPaul Mullowney 
459da79fbbcSStefano Zampini         /* assign the pointer */
460aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4612cbc15d9SMark         loTriFactor->AA_h = AALo;
46257d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
46357d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4644863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
465da79fbbcSStefano Zampini       } else { /* update values only */
4662cbc15d9SMark         if (!loTriFactor->AA_h) {
4672cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4682cbc15d9SMark         }
469da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4702cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
471da79fbbcSStefano Zampini         v        = aa;
472da79fbbcSStefano Zampini         vi       = aj;
473da79fbbcSStefano Zampini         offset   = 1;
474da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
475da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4762cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
477da79fbbcSStefano Zampini           offset      += nz;
4782cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
479da79fbbcSStefano Zampini           offset      += 1;
480da79fbbcSStefano Zampini           v  += nz;
481da79fbbcSStefano Zampini         }
4822cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
483da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
484da79fbbcSStefano Zampini       }
4859ae82921SPaul Mullowney     } catch(char *ex) {
4869ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4879ae82921SPaul Mullowney     }
4889ae82921SPaul Mullowney   }
4899ae82921SPaul Mullowney   PetscFunctionReturn(0);
4909ae82921SPaul Mullowney }
4919ae82921SPaul Mullowney 
492087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4939ae82921SPaul Mullowney {
4949ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4959ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4969ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
497aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
4989ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4999ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
5009ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
5019ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
5029ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
5039ae82921SPaul Mullowney   PetscErrorCode                    ierr;
50457d48284SJunchao Zhang   cudaError_t                       cerr;
5059ae82921SPaul Mullowney 
5069ae82921SPaul Mullowney   PetscFunctionBegin;
507cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
508c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5099ae82921SPaul Mullowney     try {
5109ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5119ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
512da79fbbcSStefano Zampini       if (!upTriFactor) {
5132cbc15d9SMark         PetscScalar *AAUp;
5142cbc15d9SMark 
5152cbc15d9SMark         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
5162cbc15d9SMark 
5179ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
51857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
51957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
5209ae82921SPaul Mullowney 
5219ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5229ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5239ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5249ae82921SPaul Mullowney         offset = nzUpper;
5259ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5269ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5279ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5289ae82921SPaul Mullowney 
529e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5309ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5319ae82921SPaul Mullowney 
532e057df02SPaul Mullowney           /* decrement the offset */
5339ae82921SPaul Mullowney           offset -= (nz+1);
5349ae82921SPaul Mullowney 
535e057df02SPaul Mullowney           /* first, set the diagonal elements */
5369ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
53709f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5389ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5399ae82921SPaul Mullowney 
540580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
541580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5429ae82921SPaul Mullowney         }
5432205254eSKarl Rupp 
544aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
545da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
546da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5472205254eSKarl Rupp 
548aa372e3fSPaul Mullowney         /* Create the matrix description */
54957d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
55057d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
5511b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
552afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
553afb2bd1cSJunchao Zhang        #else
55457d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
555afb2bd1cSJunchao Zhang        #endif
55657d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
55757d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
558aa372e3fSPaul Mullowney 
559aa372e3fSPaul Mullowney         /* set the operation */
560aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
561aa372e3fSPaul Mullowney 
562aa372e3fSPaul Mullowney         /* set the matrix */
563aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
564aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
565aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
566aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
567aa372e3fSPaul Mullowney 
568aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
569aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
570aa372e3fSPaul Mullowney 
571aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
572aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
573aa372e3fSPaul Mullowney 
574aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
575aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
576aa372e3fSPaul Mullowney 
577afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
578da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
579afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
5801b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
581afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
582afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
583afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
584afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
585afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
586afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
587afb2bd1cSJunchao Zhang       #endif
588afb2bd1cSJunchao Zhang 
589aa372e3fSPaul Mullowney         /* perform the solve analysis */
590aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
591aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
592aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
593afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
5941b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
595afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
596afb2bd1cSJunchao Zhang                                #endif
597afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
598da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
599da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
600aa372e3fSPaul Mullowney 
601da79fbbcSStefano Zampini         /* assign the pointer */
602aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
6032cbc15d9SMark         upTriFactor->AA_h = AAUp;
60457d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
60557d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
6064863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
607da79fbbcSStefano Zampini       } else {
6082cbc15d9SMark         if (!upTriFactor->AA_h) {
6092cbc15d9SMark           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
6102cbc15d9SMark         }
611da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
612da79fbbcSStefano Zampini         offset = nzUpper;
613da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
614da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
615da79fbbcSStefano Zampini 
616da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
617da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
618da79fbbcSStefano Zampini 
619da79fbbcSStefano Zampini           /* decrement the offset */
620da79fbbcSStefano Zampini           offset -= (nz+1);
621da79fbbcSStefano Zampini 
622da79fbbcSStefano Zampini           /* first, set the diagonal elements */
6232cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
6242cbc15d9SMark           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
625da79fbbcSStefano Zampini         }
6262cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
627da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
628da79fbbcSStefano Zampini       }
6299ae82921SPaul Mullowney     } catch(char *ex) {
6309ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6319ae82921SPaul Mullowney     }
6329ae82921SPaul Mullowney   }
6339ae82921SPaul Mullowney   PetscFunctionReturn(0);
6349ae82921SPaul Mullowney }
6359ae82921SPaul Mullowney 
636087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6379ae82921SPaul Mullowney {
6389ae82921SPaul Mullowney   PetscErrorCode               ierr;
6399ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6409ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6419ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6429ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6439ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6449ae82921SPaul Mullowney 
6459ae82921SPaul Mullowney   PetscFunctionBegin;
646da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
647087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
648087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
6492205254eSKarl Rupp 
650da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
651aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6529ae82921SPaul Mullowney 
653c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
654e057df02SPaul Mullowney   /* lower triangular indices */
6559ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
656da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
657da79fbbcSStefano Zampini     const PetscInt *r;
658da79fbbcSStefano Zampini 
659da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
660aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
661aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6629ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
663da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
664da79fbbcSStefano Zampini   }
6659ae82921SPaul Mullowney 
666e057df02SPaul Mullowney   /* upper triangular indices */
6679ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
668da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
669da79fbbcSStefano Zampini     const PetscInt *c;
670da79fbbcSStefano Zampini 
671da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
672aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
673aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6749ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
675da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
676da79fbbcSStefano Zampini   }
6779ae82921SPaul Mullowney   PetscFunctionReturn(0);
6789ae82921SPaul Mullowney }
6799ae82921SPaul Mullowney 
680087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
681087f3262SPaul Mullowney {
682087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
683087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
684aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
685aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
686087f3262SPaul Mullowney   cusparseStatus_t                  stat;
687087f3262SPaul Mullowney   PetscErrorCode                    ierr;
68857d48284SJunchao Zhang   cudaError_t                       cerr;
689087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
690087f3262SPaul Mullowney   PetscScalar                       *AAUp;
691087f3262SPaul Mullowney   PetscScalar                       *AALo;
692087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
693087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
694087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
695087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
696087f3262SPaul Mullowney 
697087f3262SPaul Mullowney   PetscFunctionBegin;
698cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
699c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
700087f3262SPaul Mullowney     try {
701da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
702da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
703da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
704087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
70557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
70657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
707087f3262SPaul Mullowney 
708087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
709087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
710087f3262SPaul Mullowney         AiUp[n]=nzUpper;
711087f3262SPaul Mullowney         offset = 0;
712087f3262SPaul Mullowney         for (i=0; i<n; i++) {
713087f3262SPaul Mullowney           /* set the pointers */
714087f3262SPaul Mullowney           v  = aa + ai[i];
715087f3262SPaul Mullowney           vj = aj + ai[i];
716087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
717087f3262SPaul Mullowney 
718087f3262SPaul Mullowney           /* first, set the diagonal elements */
719087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
72009f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
721087f3262SPaul Mullowney           AiUp[i]      = offset;
72209f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
723087f3262SPaul Mullowney 
724087f3262SPaul Mullowney           offset+=1;
725087f3262SPaul Mullowney           if (nz>0) {
726f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
727580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
728087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
729087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
730087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
731087f3262SPaul Mullowney             }
732087f3262SPaul Mullowney             offset+=nz;
733087f3262SPaul Mullowney           }
734087f3262SPaul Mullowney         }
735087f3262SPaul Mullowney 
736aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
737da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
738da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
739087f3262SPaul Mullowney 
740aa372e3fSPaul Mullowney         /* Create the matrix description */
74157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
74257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7431b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
744afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
745afb2bd1cSJunchao Zhang        #else
74657d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
747afb2bd1cSJunchao Zhang        #endif
74857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
74957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
750087f3262SPaul Mullowney 
751aa372e3fSPaul Mullowney         /* set the matrix */
752aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
753aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
754aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
755aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
756aa372e3fSPaul Mullowney 
757aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
758aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
759aa372e3fSPaul Mullowney 
760aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
761aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
762aa372e3fSPaul Mullowney 
763aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
764aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
765aa372e3fSPaul Mullowney 
766afb2bd1cSJunchao Zhang         /* set the operation */
767afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
768afb2bd1cSJunchao Zhang 
769afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
770da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
771afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7721b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
773afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
774afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
775afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
776afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
777afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
778afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
779afb2bd1cSJunchao Zhang       #endif
780afb2bd1cSJunchao Zhang 
781aa372e3fSPaul Mullowney         /* perform the solve analysis */
782aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
783aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
784aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
785afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7861b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
787afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
788afb2bd1cSJunchao Zhang                                 #endif
789afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
790da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
791da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
792aa372e3fSPaul Mullowney 
793da79fbbcSStefano Zampini         /* assign the pointer */
794aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
795aa372e3fSPaul Mullowney 
796aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
797da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
798da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
799aa372e3fSPaul Mullowney 
800aa372e3fSPaul Mullowney         /* Create the matrix description */
80157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
80257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8031b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
804afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
805afb2bd1cSJunchao Zhang        #else
80657d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
807afb2bd1cSJunchao Zhang        #endif
80857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
80957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
810aa372e3fSPaul Mullowney 
811aa372e3fSPaul Mullowney         /* set the operation */
812aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
813aa372e3fSPaul Mullowney 
814aa372e3fSPaul Mullowney         /* set the matrix */
815aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
816aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
817aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
818aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
819aa372e3fSPaul Mullowney 
820aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
821aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
822aa372e3fSPaul Mullowney 
823aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
824aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
825aa372e3fSPaul Mullowney 
826aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
827aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
828aa372e3fSPaul Mullowney 
829afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
830da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
831afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8321b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
833afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
834afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
835afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
836afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
837afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
838afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
839afb2bd1cSJunchao Zhang       #endif
840afb2bd1cSJunchao Zhang 
841aa372e3fSPaul Mullowney         /* perform the solve analysis */
842aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
843aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
844aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
845afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8461b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
847afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
848afb2bd1cSJunchao Zhang                                 #endif
849afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
850da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
851da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
852aa372e3fSPaul Mullowney 
853da79fbbcSStefano Zampini         /* assign the pointer */
854aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
855087f3262SPaul Mullowney 
856da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
85757d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
85857d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
859da79fbbcSStefano Zampini       } else {
860da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
861da79fbbcSStefano Zampini         offset = 0;
862da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
863da79fbbcSStefano Zampini           /* set the pointers */
864da79fbbcSStefano Zampini           v  = aa + ai[i];
865da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
866da79fbbcSStefano Zampini 
867da79fbbcSStefano Zampini           /* first, set the diagonal elements */
868da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
869da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
870da79fbbcSStefano Zampini 
871da79fbbcSStefano Zampini           offset+=1;
872da79fbbcSStefano Zampini           if (nz>0) {
873da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
874da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
875da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
876da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
877da79fbbcSStefano Zampini             }
878da79fbbcSStefano Zampini             offset+=nz;
879da79fbbcSStefano Zampini           }
880da79fbbcSStefano Zampini         }
881da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
882da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
883da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
884da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
885da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
886da79fbbcSStefano Zampini       }
88757d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
88857d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
889087f3262SPaul Mullowney     } catch(char *ex) {
890087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
891087f3262SPaul Mullowney     }
892087f3262SPaul Mullowney   }
893087f3262SPaul Mullowney   PetscFunctionReturn(0);
894087f3262SPaul Mullowney }
895087f3262SPaul Mullowney 
896087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
8979ae82921SPaul Mullowney {
8989ae82921SPaul Mullowney   PetscErrorCode               ierr;
899087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
900087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
901087f3262SPaul Mullowney   IS                           ip = a->row;
902087f3262SPaul Mullowney   PetscBool                    perm_identity;
903087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
904087f3262SPaul Mullowney 
905087f3262SPaul Mullowney   PetscFunctionBegin;
906da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
907087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
908da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
909aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
910aa372e3fSPaul Mullowney 
911da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
912da79fbbcSStefano Zampini 
913087f3262SPaul Mullowney   /* lower triangular indices */
914087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
915087f3262SPaul Mullowney   if (!perm_identity) {
9164e4bbfaaSStefano Zampini     IS             iip;
917da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9184e4bbfaaSStefano Zampini 
9194e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
9204e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
921da79fbbcSStefano Zampini     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
922aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
923aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
924aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9254e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9264e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
9274e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
928087f3262SPaul Mullowney     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
929da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
930da79fbbcSStefano Zampini   }
931087f3262SPaul Mullowney   PetscFunctionReturn(0);
932087f3262SPaul Mullowney }
933087f3262SPaul Mullowney 
9346fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
9359ae82921SPaul Mullowney {
9369ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
9379ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
9389ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
939b175d8bbSPaul Mullowney   PetscErrorCode ierr;
9409ae82921SPaul Mullowney 
9419ae82921SPaul Mullowney   PetscFunctionBegin;
94257181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
9439ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
944ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
945e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9469ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
9479ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
948bda325fcSPaul Mullowney   if (row_identity && col_identity) {
949bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
950bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9514e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9524e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
953bda325fcSPaul Mullowney   } else {
954bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
955bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9564e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9574e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
958bda325fcSPaul Mullowney   }
9598dc1d2a3SPaul Mullowney 
960e057df02SPaul Mullowney   /* get the triangular factors */
961087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
9629ae82921SPaul Mullowney   PetscFunctionReturn(0);
9639ae82921SPaul Mullowney }
9649ae82921SPaul Mullowney 
965087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
966087f3262SPaul Mullowney {
967087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
968087f3262SPaul Mullowney   IS             ip = b->row;
969087f3262SPaul Mullowney   PetscBool      perm_identity;
970b175d8bbSPaul Mullowney   PetscErrorCode ierr;
971087f3262SPaul Mullowney 
972087f3262SPaul Mullowney   PetscFunctionBegin;
97357181aedSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
974087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
975ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
976087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
977087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
978087f3262SPaul Mullowney   if (perm_identity) {
979087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
980087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9814e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9824e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
983087f3262SPaul Mullowney   } else {
984087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
985087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9864e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9874e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
988087f3262SPaul Mullowney   }
989087f3262SPaul Mullowney 
990087f3262SPaul Mullowney   /* get the triangular factors */
991087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
992087f3262SPaul Mullowney   PetscFunctionReturn(0);
993087f3262SPaul Mullowney }
9949ae82921SPaul Mullowney 
995b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
996bda325fcSPaul Mullowney {
997bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
998aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
999aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1000da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
1001da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
1002bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1003aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
1004aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
1005aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
1006aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
10071b0a6780SStefano Zampini   cudaError_t                       cerr;
1008da79fbbcSStefano Zampini   PetscErrorCode                    ierr;
1009b175d8bbSPaul Mullowney 
1010bda325fcSPaul Mullowney   PetscFunctionBegin;
1011aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
1012da79fbbcSStefano Zampini   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1013da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1014aa372e3fSPaul Mullowney 
1015aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1016aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1017aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1018aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1019aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1020aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1021aa372e3fSPaul Mullowney 
1022aa372e3fSPaul Mullowney   /* Create the matrix description */
102357d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
102457d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
102557d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
102657d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
102757d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1028aa372e3fSPaul Mullowney 
1029aa372e3fSPaul Mullowney   /* set the operation */
1030aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1031aa372e3fSPaul Mullowney 
1032aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1033aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1034afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1035afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1036aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1037afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1038afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1039afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1040aa372e3fSPaul Mullowney 
1041aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1042afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1043afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1044afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1045afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
1046afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
1047afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
1048afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
1049afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1050afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1051afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
10521b0a6780SStefano Zampini   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1053afb2bd1cSJunchao Zhang #endif
1054afb2bd1cSJunchao Zhang 
1055da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1056aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1057aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1058aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
1059aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
1060aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
1061aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
1062afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1063afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1064afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1065afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1066afb2bd1cSJunchao Zhang                         #else
1067afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1068afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1069afb2bd1cSJunchao Zhang                         #endif
1070afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1071da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1072da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1073aa372e3fSPaul Mullowney 
1074afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1075da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1076afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
10771b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1078afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1079afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1080afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1081afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1082afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1083afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1084afb2bd1cSJunchao Zhang #endif
1085afb2bd1cSJunchao Zhang 
1086afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1087aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1088afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1089afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1090afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
10911b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1092afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1093afb2bd1cSJunchao Zhang                           #endif
1094afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1095da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1096da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1097aa372e3fSPaul Mullowney 
1098da79fbbcSStefano Zampini   /* assign the pointer */
1099aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1100aa372e3fSPaul Mullowney 
1101aa372e3fSPaul Mullowney   /*********************************************/
1102aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1103aa372e3fSPaul Mullowney   /*********************************************/
1104aa372e3fSPaul Mullowney 
1105aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1106da79fbbcSStefano Zampini   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1107da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1108aa372e3fSPaul Mullowney 
1109aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1110aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1111aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1112aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1113aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1114aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1115aa372e3fSPaul Mullowney 
1116aa372e3fSPaul Mullowney   /* Create the matrix description */
111757d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
111857d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
111957d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
112057d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
112157d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1122aa372e3fSPaul Mullowney 
1123aa372e3fSPaul Mullowney   /* set the operation */
1124aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1125aa372e3fSPaul Mullowney 
1126aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1127aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1128afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1129afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1130aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1131afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1132afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1133afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1134aa372e3fSPaul Mullowney 
1135aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1136afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1137afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1138afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1139afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1140afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1141afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1142afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1143afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1144afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1145afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1146afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1147afb2bd1cSJunchao Zhang #endif
1148afb2bd1cSJunchao Zhang 
1149da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1150aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1151aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1152aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1153aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1154aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1155aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1156afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1157afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1158afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1159afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1160afb2bd1cSJunchao Zhang                         #else
1161afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1162afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1163afb2bd1cSJunchao Zhang                         #endif
1164afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1165da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1166da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1167aa372e3fSPaul Mullowney 
1168afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1169da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1170afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
11711b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1172afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1173afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1174afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1175afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1176afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1177afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1178afb2bd1cSJunchao Zhang   #endif
1179afb2bd1cSJunchao Zhang 
1180afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1181aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1182afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1183afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1184afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
11851b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1186afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1187afb2bd1cSJunchao Zhang                           #endif
1188afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1189da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1190da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1191aa372e3fSPaul Mullowney 
1192da79fbbcSStefano Zampini   /* assign the pointer */
1193aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1194bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1195bda325fcSPaul Mullowney }
1196bda325fcSPaul Mullowney 
1197b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1198bda325fcSPaul Mullowney {
1199aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1200aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1201aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1202bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1203bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1204aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1205b06137fdSPaul Mullowney   cudaError_t                  err;
120685ba7357SStefano Zampini   PetscErrorCode               ierr;
1207b175d8bbSPaul Mullowney 
1208bda325fcSPaul Mullowney   PetscFunctionBegin;
1209fcdce8c4SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
121085ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
121185ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
121285ba7357SStefano Zampini   /* create cusparse matrix */
1213aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
121457d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1215aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
121657d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
121757d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1218aa372e3fSPaul Mullowney 
1219b06137fdSPaul Mullowney   /* set alpha and beta */
1220afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12217656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12227656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1223afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12247656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12257656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
122657d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1227b06137fdSPaul Mullowney 
1228aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1229aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1230aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1231554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1232554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1233aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1234a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1235aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1236aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1237a3fdcf43SKarl Rupp 
1238*039c6fbaSStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
123981902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1240afb2bd1cSJunchao Zhang 
124181902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1242afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1243afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1244afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1245afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1246afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1247afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1248afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1249afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1250afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1251afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1252afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1253afb2bd1cSJunchao Zhang    #endif
1254afb2bd1cSJunchao Zhang 
1255a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1256a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1257aa372e3fSPaul Mullowney                             matrix->values->data().get(),
125881902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1259aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1260aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1261afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1262afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1263afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1264afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1265afb2bd1cSJunchao Zhang                           #else
1266afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1267afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1268afb2bd1cSJunchao Zhang                           #endif
1269afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1270aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1271afb2bd1cSJunchao Zhang 
1272afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1273afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1274afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1275afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1276afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1277afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1278afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1279afb2bd1cSJunchao Zhang    #endif
1280aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1281afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1282afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1283afb2bd1cSJunchao Zhang    #else
1284aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
128551c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
128651c6d536SStefano Zampini     /* First convert HYB to CSR */
1287aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1288aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1289aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1290aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1291aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1292aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1293aa372e3fSPaul Mullowney 
1294aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1295aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1296aa372e3fSPaul Mullowney                             temp->values->data().get(),
1297aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
129857d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1299aa372e3fSPaul Mullowney 
1300aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1301aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1302aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1303aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1304aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1305aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1306aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1307aa372e3fSPaul Mullowney 
1308aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1309aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1310aa372e3fSPaul Mullowney                             temp->values->data().get(),
1311aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1312aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1313aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1314aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1315aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
131657d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1317aa372e3fSPaul Mullowney 
1318aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1319aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
132057d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1321aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1322aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1323aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1324aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1325aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1326aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
132757d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1328aa372e3fSPaul Mullowney 
1329aa372e3fSPaul Mullowney     /* assign the pointer */
1330aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1331aa372e3fSPaul Mullowney     /* delete temporaries */
1332aa372e3fSPaul Mullowney     if (tempT) {
1333aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1334aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1335aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1336aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1337087f3262SPaul Mullowney     }
1338aa372e3fSPaul Mullowney     if (temp) {
1339aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1340aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1341aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1342aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1343aa372e3fSPaul Mullowney     }
1344afb2bd1cSJunchao Zhang    #endif
1345aa372e3fSPaul Mullowney   }
134605035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
134785ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
134885ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1349213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1350213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1351aa372e3fSPaul Mullowney   /* assign the pointer */
1352aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1353bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1354bda325fcSPaul Mullowney }
1355bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve with the triangular factors stored in A->spptr,
  applying the row permutation (rpermIndices) before and the column permutation (cpermIndices) after
  the two cusparse_solve() calls.

  Input Parameters:
+ A  - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called first.

  Open question kept from the original source: why do we need to analyze the transposed matrix again?
  Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  PetscInt                              n        = xx->map->n;
  const PetscScalar                     *bdata;
  PetscScalar                           *xdata;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  PetscErrorCode                        ierr;
  cusparseStatus_t                      cstat;
  cudaError_t                           cuerr;

  PetscFunctionBegin;
  /* Build the transposed factors on first use */
  if (!(loTriFactorPresent(lowerT) || loTriFactorPresent(upperT))) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Acquire the device arrays and wrap them for use with thrust */
  ierr = VecCUDAGetArrayWrite(xx,&xdata);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdata);CHKERRQ(ierr);
  xdev = thrust::device_pointer_cast(xdata);
  bdev = thrust::device_pointer_cast(bdata);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: gather bb through the row permutation into xx */
  thrust::copy(thrust::make_permutation_iterator(bdev, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bdev+n, factors->rpermIndices->end()),
               xdev);

  /* Step 2: solve with the transposed upper factor, xx -> work */
  cstat = cusparse_solve(factors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upperT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        xdata, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upperT->solvePolicy, upperT->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Step 3: solve with the transposed lower factor, work -> xx */
  cstat = cusparse_solve(factors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lowerT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), xdata
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lowerT->solvePolicy, lowerT->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Step 4: gather xx through the column permutation into the work vector (cannot be done in place) */
  thrust::copy(thrust::make_permutation_iterator(xdev, factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xdev+n, factors->cpermIndices->end()),
               work->begin());

  /* Step 5: move the permuted result from the work vector back into xx */
  thrust::copy(work->begin(), work->end(), xdev);

  /* Release the device arrays, then close the timing/flop logging */
  ierr  = VecCUDARestoreArrayRead(bb,&bdata);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&xdata);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1442bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve with the triangular factors stored
  in A->spptr; no permutation is applied to either vector.

  Input Parameters:
+ A  - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called first.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *bdata;
  PetscScalar                       *xdata;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* Build the transposed factors on first use */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Acquire the device arrays */
  ierr = VecCUDAGetArrayWrite(xx,&xdata);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdata);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: solve with the transposed upper factor, bb -> work */
  cstat = cusparse_solve(factors->handle, upperT->solveOp,
                        upperT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upperT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upperT->descr,
                        upperT->csrMat->values->data().get(),
                        upperT->csrMat->row_offsets->data().get(),
                        upperT->csrMat->column_indices->data().get(),
                        upperT->solveInfo,
                        bdata, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upperT->solvePolicy, upperT->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Step 2: solve with the transposed lower factor, work -> xx */
  cstat = cusparse_solve(factors->handle, lowerT->solveOp,
                        lowerT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lowerT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lowerT->descr,
                        lowerT->csrMat->values->data().get(),
                        lowerT->csrMat->row_offsets->data().get(),
                        lowerT->csrMat->column_indices->data().get(),
                        lowerT->solveInfo,
                        work->data().get(), xdata
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lowerT->solvePolicy, lowerT->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Release the device arrays, then close the timing/flop logging */
  ierr  = VecCUDARestoreArrayRead(bb,&bdata);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&xdata);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1510bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - solve with the lower and then the upper triangular factor stored in A->spptr,
  applying the row permutation (rpermIndices) to the right-hand side beforehand and the column
  permutation (cpermIndices) to the result afterwards.

  Input Parameters:
+ A  - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                     *bdata;
  PetscScalar                           *xdata;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  PetscErrorCode                        ierr;
  cusparseStatus_t                      cstat;
  cudaError_t                           cuerr;

  PetscFunctionBegin;

  /* Acquire the device arrays and wrap them for use with thrust */
  ierr = VecCUDAGetArrayWrite(xx,&xdata);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdata);CHKERRQ(ierr);
  xdev = thrust::device_pointer_cast(xdata);
  bdev = thrust::device_pointer_cast(bdata);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: gather bb through the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(bdev, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bdev, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: solve with the lower factor, work -> xx */
  cstat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        work->data().get(), xdata
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Step 3: solve with the upper factor, xx -> work */
  cstat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        xdata, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Step 4: gather the work vector through the column permutation into xx */
  thrust::copy(thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               xdev);

  /* Release the device arrays, then close the timing/flop logging */
  ierr  = VecCUDARestoreArrayRead(bb,&bdata);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&xdata);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
15859ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve with the lower and then the upper triangular factor
  stored in A->spptr; no permutation is applied to either vector.

  Input Parameters:
+ A  - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *bdata;
  PetscScalar                       *xdata;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* Acquire the device arrays */
  ierr = VecCUDAGetArrayWrite(xx,&xdata);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&bdata);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: solve with the lower factor, bb -> work */
  cstat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        bdata, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Step 2: solve with the upper factor, work -> xx */
  cstat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), xdata
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
);CHKERRCUSPARSE(cstat);

  /* Release the device arrays, then close the timing/flop logging */
  ierr  = VecCUDARestoreArrayRead(bb,&bdata);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&xdata);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16459ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Refreshes the host copy of the matrix values from the device.

  Input Parameter:
. A - the matrix

  Notes:
  Acts only when A->offloadmask is PETSC_OFFLOAD_GPU. In that case a->nz scalars are copied from
  the device values array into a->a and the mask is set to PETSC_OFFLOAD_BOTH; for every other
  mask value the routine returns without doing anything.
  The device storage is accessed through a CsrMatrix cast, so the CSR layout is assumed here.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *aij;
  Mat_SeqAIJCUSPARSE *gpu;
  CsrMatrix          *csr;
  size_t             nbytes;

  PetscFunctionBegin;
  /* nothing to transfer unless the device holds the only up-to-date values */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  aij    = (Mat_SeqAIJ*)A->data;
  gpu    = (Mat_SeqAIJCUSPARSE*)A->spptr;
  csr    = (CsrMatrix*)gpu->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);

  /* host and device now agree */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
16667e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Hands out the host array of matrix values.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a)

  Notes:
  The host values are first brought up to date with MatSeqAIJCUSPARSECopyFromGPU(). The offload
  mask is then set to PETSC_OFFLOAD_CPU, i.e. the device copy is treated as stale from here on.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
16787e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of the matrix current.

  Input Parameter:
. A - the matrix

  Notes:
  Raises PETSC_ERR_PLIB if the matrix is bound to the CPU.
  Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU:
  - if the nonzero state recorded on the device structure matches A->nonzerostate and the storage
    format is CSR, only the numerical values are re-uploaded (and the explicit transpose, when it
    already exists, is regenerated from them);
  - otherwise all device structures are destroyed and rebuilt from the host CSR data
    (optionally converted to ELL/HYB for CUDA versions older than 11).
  On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host has no values array
  (a->a is NULL), in which case only the nonzero structure was uploaded and the mask is left alone.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* cleared when no values could be uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Same nonzero pattern as the one already on the device: copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Nonzero pattern changed (or non-CSR format): throw away the device data and rebuild it */
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      /* Reset the pointers right away: rowoffsets_gpu is not recreated in this routine and
         workVector is only recreated further down, so without this an error return (or any later
         NULL-guarded allocation/delete) would operate on freed memory */
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row structure to upload: compressed rows (nonempty rows only) or the full CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* without host values only the structure is uploaded; the count then comes from the row offsets */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build the device matrix in the requested storage format (CSR, or ELL/HYB) */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* stage the data as a temporary CSR matrix on the device, then convert it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* the temporary CSR staging copy is no longer needed */
          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* logged volume: row offsets + column indices (int), compressed row indices (PetscInt), values + 3 scalar constants */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero pattern the device structures correspond to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18579ae82921SPaul Mullowney 
/* Functor on a 2-tuple: accumulates element 0 into element 1, i.e. get<1> = get<1> + get<0>.
   NOTE(review): presumably applied through thrust::for_each on zipped iterators - confirm at the call sites */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
1867aa372e3fSPaul Mullowney 
/* Functor on a 2-tuple: copies element 0 into element 1, i.e. get<1> = get<0> */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
18777e8381f9SStefano Zampini 
/* Functor on a 2-tuple: copies element 1 into element 0, i.e. get<0> = get<1>
   (same as VecCUDAEquals with the roles of the two elements swapped) */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1887e6e9a74fSStefano Zampini 
/* Per-product scratch data attached to a matrix product (stored in product->data) and
   released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;     /* NOTE(review): presumably records whether C was a plain dense matrix originally - confirm where it is set */
  PetscScalar           *Bt;          /* device buffer released with cudaFree(); holds the explicitly transposed B when the library cannot transpose B itself */
  Mat                   X;            /* intermediate dense matrix used by the RARt and PtAP products */
  PetscBool             reusesym;     /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;        /* flop count; NOTE(review): presumably for logging the sparse-sparse product - confirm */
  CsrMatrix             *Bcsr;        /* CsrMatrix wrapper; only the struct itself is deleted on destroy */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;  /* sparse matrix descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C or X */
  PetscInt              Blda;         /* Record leading dimension of B here to detect changes */
  PetscInt              Clda;         /* Record leading dimension of C here to detect changes */
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                  *mmBuffer;    /* device work buffer passed to cusparseSpMM() */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1907ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product context.

  Input Parameter:
. data - pointer to the MatMatCusparse structure (passed as void*)

  Notes:
  Frees the device buffers, deletes the Bcsr wrapper, destroys the cuSPARSE descriptors
  (CUDA 11 and later only), destroys the intermediate matrix X and finally frees the structure.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
 #endif

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* each resource is released only if it was actually created */
  if (mm->matSpBDescr) {
    cstat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->mmBuffer) {
    cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr);
  }
  if (mm->mmBuffer2) {
    cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr);
  }
  if (mm->matBDescr) {
    cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->matCDescr) {
    cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->spgemmDesc) {
    cstat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cstat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1932ccdfe979SStefano Zampini 
/* Dense-dense product routine declared here and defined elsewhere; it is called below to finish the
   RARt and PtAP products after the sparse-times-dense intermediate has been computed.
   NOTE(review): the two PetscBool flags presumably select transposition of the operands - confirm against the definition */
PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1934ccdfe979SStefano Zampini 
1935ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1936ccdfe979SStefano Zampini {
1937ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1938ccdfe979SStefano Zampini   Mat                          A,B;
1939afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1940ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1941ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1942ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1943ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1944ccdfe979SStefano Zampini   const PetscScalar            *barray;
1945ccdfe979SStefano Zampini   PetscScalar                  *carray;
1946ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1947ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1948ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1949ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1950afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1951ccdfe979SStefano Zampini 
1952ccdfe979SStefano Zampini   PetscFunctionBegin;
1953ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1954ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1955ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1956ccdfe979SStefano Zampini   A    = product->A;
1957ccdfe979SStefano Zampini   B    = product->B;
1958ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1959ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1960ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1961ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1962ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1963ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1964ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1965ccdfe979SStefano Zampini   switch (product->type) {
1966ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1967ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1968ccdfe979SStefano Zampini     mat = cusp->mat;
1969ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1970ccdfe979SStefano Zampini     m   = A->rmap->n;
1971ccdfe979SStefano Zampini     n   = B->cmap->n;
1972ccdfe979SStefano Zampini     break;
1973ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1974e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1975e6e9a74fSStefano Zampini       mat = cusp->mat;
1976e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1977e6e9a74fSStefano Zampini     } else {
1978ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1979ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1980ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1981e6e9a74fSStefano Zampini     }
1982ccdfe979SStefano Zampini     m = A->cmap->n;
1983ccdfe979SStefano Zampini     n = B->cmap->n;
1984ccdfe979SStefano Zampini     break;
1985ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1986ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1987ccdfe979SStefano Zampini     mat = cusp->mat;
1988ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1989ccdfe979SStefano Zampini     m   = A->rmap->n;
1990ccdfe979SStefano Zampini     n   = B->rmap->n;
1991ccdfe979SStefano Zampini     break;
1992ccdfe979SStefano Zampini   default:
1993ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1994ccdfe979SStefano Zampini   }
1995ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1996ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1997ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1998ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1999afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2000ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2001afb2bd1cSJunchao Zhang 
2002ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2003c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2004c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2005c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2006c8378d12SStefano Zampini   } else {
2007c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2008c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2009c8378d12SStefano Zampini   }
2010c8378d12SStefano Zampini 
2011c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2012afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2013afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2014fcdce8c4SStefano Zampini   /* (re)allcoate mmBuffer if not initialized or LDAs are different */
2015afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2016fcdce8c4SStefano Zampini     size_t mmBufferSize;
2017afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2018afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2019afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2020afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2021afb2bd1cSJunchao Zhang     }
2022c8378d12SStefano Zampini 
2023afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2024afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2025afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2026afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2027afb2bd1cSJunchao Zhang     }
2028afb2bd1cSJunchao Zhang 
2029afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2030afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2031afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2032afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2033afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2034afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2035afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2036afb2bd1cSJunchao Zhang     }
2037afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2038afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2039afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2040fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2041fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2042fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2043fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2044fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2045fcdce8c4SStefano Zampini     }
2046afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2047afb2bd1cSJunchao Zhang   } else {
2048afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2049afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2050afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2051afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2052afb2bd1cSJunchao Zhang   }
2053afb2bd1cSJunchao Zhang 
2054afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2055afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2056afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2057afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2058fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2059afb2bd1cSJunchao Zhang  #else
2060afb2bd1cSJunchao Zhang   PetscInt k;
2061afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2062ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2063ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2064ccdfe979SStefano Zampini     cublasStatus_t cerr;
2065ccdfe979SStefano Zampini 
2066ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2067ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2068ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2069ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2070ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2071ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2072ccdfe979SStefano Zampini     blda = B->cmap->n;
2073afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2074afb2bd1cSJunchao Zhang   } else {
2075afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2076ccdfe979SStefano Zampini   }
2077ccdfe979SStefano Zampini 
2078afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2079ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2080afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2081ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2082ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2083ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2084ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2085ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2086afb2bd1cSJunchao Zhang  #endif
2087afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2088c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2089c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2090ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2091ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2092ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2093ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2094ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2095ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2096ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2097ccdfe979SStefano Zampini   } else {
2098ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2099ccdfe979SStefano Zampini   }
2100ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2101ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2102ccdfe979SStefano Zampini   }
2103ccdfe979SStefano Zampini   if (!biscuda) {
2104ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2105ccdfe979SStefano Zampini   }
2106ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2107ccdfe979SStefano Zampini }
2108ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for products between a
  MATSEQAIJCUSPARSE matrix A (MAT_CUSPARSE_CSR storage only) and a dense matrix B.

  Input Parameter:
. C - the product matrix; C->product holds A, B and the product type, and C->product->data must be empty

  Notes:
  Supported product types are MATPRODUCT_AB, MATPRODUCT_AtB, MATPRODUCT_ABt, MATPRODUCT_PtAP and
  MATPRODUCT_RARt; any other type raises PETSC_ERR_PLIB.

  The routine sets the sizes of C, switches C to MATSEQDENSECUDA (remembering whether it was
  MATSEQDENSE beforehand), allocates the MatMatCusparse product data and installs
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA as the numeric routine.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *prod = C->product;
  Mat                A,B;
  Mat_SeqAIJCUSPARSE *Acusp;
  MatMatCusparse     *pdata;
  PetscInt           crows,ccols;
  PetscBool          isaijcusparse,cwashostdense;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = prod->A;
  B = prod->B;

  /* A must be a SeqAIJCUSPARSE matrix stored in CSR format */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusparse);CHKERRQ(ierr);
  if (!isaijcusparse) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* local dimensions of C for each supported product type */
  if (prod->type == MATPRODUCT_AB) {
    crows = A->rmap->n;
    ccols = B->cmap->n;
  } else if (prod->type == MATPRODUCT_AtB) {
    crows = A->cmap->n;
    ccols = B->cmap->n;
  } else if (prod->type == MATPRODUCT_ABt) {
    crows = A->rmap->n;
    ccols = B->rmap->n;
  } else if (prod->type == MATPRODUCT_PtAP) {
    crows = B->cmap->n;
    ccols = B->cmap->n;
  } else if (prod->type == MATPRODUCT_RARt) {
    crows = B->rmap->n;
    ccols = B->rmap->n;
  } else {
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[prod->type]);
  }
  ierr = MatSetSizes(C,crows,ccols,crows,ccols);CHKERRQ(ierr);

  /* record whether C was a CPU dense matrix before turning it into a CUDA dense matrix:
     in that case the operation is performed on the GPU and the result copied back to the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cwashostdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* data attached to the product and consumed by the numeric phase */
  ierr = PetscNew(&pdata);CHKERRQ(ierr);
  pdata->cisdense = cwashostdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm cannot transpose B, hence a device buffer for B^T is needed */
  if (prod->type == MATPRODUCT_ABt || prod->type == MATPRODUCT_RARt) {
    cudaError_t cuerr;

    cuerr = cudaMalloc((void**)&pdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cuerr);
  }
#endif
  /* RARt and PtAP go through an intermediate dense matrix X with A->rmap->n rows */
  if (prod->type == MATPRODUCT_RARt || prod->type == MATPRODUCT_PtAP) {
    /* X has as many columns as B has rows (RARt) or columns (PtAP) */
    const PetscInt xcols = (prod->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    ierr = MatCreate(PetscObjectComm((PetscObject)C),&pdata->X);CHKERRQ(ierr);
    ierr = MatSetType(pdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    ierr = MatSetSizes(pdata->X,A->rmap->n,xcols,A->rmap->n,xcols);CHKERRQ(ierr);
  }

  /* hand the data over to the product and select the numeric routine */
  C->product->data       = pdata;
  C->product->destroy    = MatDestroy_MatMatCusparse;
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2182ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse product
  C = op(A)*op(B) between two MATSEQAIJCUSPARSE matrices stored in MAT_CUSPARSE_CSR format.

  Input Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse data created by the symbolic phase

  Notes:
  Supported product types are MATPRODUCT_AB, MATPRODUCT_AtB and MATPRODUCT_ABt. Transposed
  operands are taken from the matTranspose structures of A or B, so cuSPARSE is always called
  with CUSPARSE_OPERATION_NON_TRANSPOSE. A symmetric A (for AtB) or B (for ABt) reduces the
  product to MATPRODUCT_AB.

  If mmdata->reusesym is set, or C has no nonzeros, no computation is performed and only the
  assembly bookkeeping at the end of the routine is executed.

  Errors with PETSC_ERR_PLIB on wrong matrix types/formats or missing GPU structures, and with
  PETSC_ERR_ARG_WRONG if A or B is bound to the CPU.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE; /* one-shot: the next numeric call recomputes the values */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }
  /* empty product: nothing to compute */
  if (!c->nz) goto finalize;

  /* both operands must be SeqAIJCUSPARSE matrices that are not bound to the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* a symmetric operand makes the transposed product identical to AB */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
  if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
  /* select the (possibly explicitly transposed) GPU structures of the operands */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  /* Cmat->alpha_one and Cmat->beta_zero are allocated with cudaMalloc by the symbolic phase, so the
     handle must be in device pointer mode; set it explicitly (as the symbolic phase does) instead of
     relying on the mode left on the handle by earlier calls */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  /* flop count precomputed and stored in the product data */
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* the values of C have just been computed on the GPU */
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  /* no reallocations happened, so C->info.mallocs is left untouched */
  c->reallocs         = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2297fcdce8c4SStefano Zampini 
2298fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2299fcdce8c4SStefano Zampini {
2300fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2301fcdce8c4SStefano Zampini   Mat                          A,B;
2302fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2303fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2304fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2305fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2306fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2307fcdce8c4SStefano Zampini   PetscBool                    flg;
2308fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2309fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2310fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2311fcdce8c4SStefano Zampini   MatProductType               ptype;
2312fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2313fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2314fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2315fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2316fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2317fcdce8c4SStefano Zampini   size_t                       bufSize2;
2318fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2319fcdce8c4SStefano Zampini #else
2320fcdce8c4SStefano Zampini   int                          cnz;
2321fcdce8c4SStefano Zampini #endif
2322fcdce8c4SStefano Zampini 
2323fcdce8c4SStefano Zampini   PetscFunctionBegin;
2324fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2325fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2326fcdce8c4SStefano Zampini   A    = product->A;
2327fcdce8c4SStefano Zampini   B    = product->B;
2328fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2329fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2330fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2331fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2332fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2333fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2334fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2335fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2336fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2337fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2338fcdce8c4SStefano Zampini 
2339fcdce8c4SStefano Zampini   /* product data */
2340fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2341fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2342fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2343fcdce8c4SStefano Zampini 
2344fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2345fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2346fcdce8c4SStefano Zampini   ptype = product->type;
2347fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2348fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2349fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2350fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2351fcdce8c4SStefano Zampini   switch (ptype) {
2352fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2353fcdce8c4SStefano Zampini     m = A->rmap->n;
2354fcdce8c4SStefano Zampini     n = B->cmap->n;
2355fcdce8c4SStefano Zampini     k = A->cmap->n;
2356fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2357fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2358fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2359fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2360fcdce8c4SStefano Zampini     break;
2361fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2362fcdce8c4SStefano Zampini     m = A->cmap->n;
2363fcdce8c4SStefano Zampini     n = B->cmap->n;
2364fcdce8c4SStefano Zampini     k = A->rmap->n;
2365fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2366fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2367fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2368fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2369fcdce8c4SStefano Zampini     break;
2370fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2371fcdce8c4SStefano Zampini     m = A->rmap->n;
2372fcdce8c4SStefano Zampini     n = B->rmap->n;
2373fcdce8c4SStefano Zampini     k = A->cmap->n;
2374fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2375fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2376fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2377fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2378fcdce8c4SStefano Zampini     break;
2379fcdce8c4SStefano Zampini   default:
2380fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2381fcdce8c4SStefano Zampini   }
2382fcdce8c4SStefano Zampini 
2383fcdce8c4SStefano Zampini   /* create cusparse matrix */
2384fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2385fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2386fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2387fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2388fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2389fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2390fcdce8c4SStefano Zampini 
2391fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2392fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2393fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2394fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2395fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2396fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2397fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2398fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2399fcdce8c4SStefano Zampini   } else {
2400fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2401fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2402fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2403fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2404fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2405fcdce8c4SStefano Zampini   }
2406fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2407fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2408fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2409fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2410fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2411fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2412fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2413fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2414fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2415fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2416fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2417fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2418fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2419fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2420fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2421fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2422fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2423fcdce8c4SStefano Zampini     c->nz = 0;
2424fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2425fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2426fcdce8c4SStefano Zampini     goto finalizesym;
2427fcdce8c4SStefano Zampini   }
2428fcdce8c4SStefano Zampini 
2429fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2430fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2431fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2432fcdce8c4SStefano Zampini   if (!biscompressed) {
2433fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2434fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2435fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2436fcdce8c4SStefano Zampini #endif
2437fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2438fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2439fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2440fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2441fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2442fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2443fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2444fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2445fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2446fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2447fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2448fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2449fcdce8c4SStefano Zampini     }
2450fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2451fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2452fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2453fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2454fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2455fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2456fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2457fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2458fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2459fcdce8c4SStefano Zampini     }
2460fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2461fcdce8c4SStefano Zampini #endif
2462fcdce8c4SStefano Zampini   }
2463fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2464fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2465fcdce8c4SStefano Zampini   /* precompute flops count */
2466fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2467fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2468fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2469fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2470fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2471fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2472fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2473fcdce8c4SStefano Zampini       }
2474fcdce8c4SStefano Zampini     }
2475fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2476fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2477fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2478fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2479fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2480fcdce8c4SStefano Zampini     }
2481fcdce8c4SStefano Zampini   } else { /* TODO */
2482fcdce8c4SStefano Zampini     flops = 0.;
2483fcdce8c4SStefano Zampini   }
2484fcdce8c4SStefano Zampini 
2485fcdce8c4SStefano Zampini   mmdata->flops = flops;
2486fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2487fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2488fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2489fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2490fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2491fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2492fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2493fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2494fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2495fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2496fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2497fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2498fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2499bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2500fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2501fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2502fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2503fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2504fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2505fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2506fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2507fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2508fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2509fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2510fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2511fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2512fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2513fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2514fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2515bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2516fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2517fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2518fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2519fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2520fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2521fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2522fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2523fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
2524fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2525fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2526fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2527fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2528fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2529fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2530fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2531fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2532fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2533fcdce8c4SStefano Zampini #else
2534fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2535fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2536fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2537fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2538fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2539fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2540fcdce8c4SStefano Zampini   c->nz = cnz;
2541fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2542fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2543fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2544fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2545fcdce8c4SStefano Zampini 
2546fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2547fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2548fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2549fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2550fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2551fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2552fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2553fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2554fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2555fcdce8c4SStefano Zampini #endif
2556fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2557fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2558fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2559fcdce8c4SStefano Zampini finalizesym:
2560fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2561fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2562fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2563fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2564fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2565fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2566fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2567fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2568fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2569fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2570fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2571fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2572fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2573fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2574fcdce8c4SStefano Zampini   } else {
2575fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2576fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2577fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2578fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2579fcdce8c4SStefano Zampini   }
2580fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2581fcdce8c4SStefano Zampini     PetscInt r = 0;
2582fcdce8c4SStefano Zampini     c->i[0] = 0;
2583fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2584fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2585fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2586fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2587fcdce8c4SStefano Zampini     }
2588fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2589fcdce8c4SStefano Zampini   }
2590fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2591fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2592fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2593fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2594fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2595fcdce8c4SStefano Zampini   c->rmax = 0;
2596fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2597fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2598fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2599fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2600fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2601fcdce8c4SStefano Zampini   }
2602fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2603fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2604fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2605fcdce8c4SStefano Zampini 
2606fcdce8c4SStefano Zampini   C->nonzerostate++;
2607fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2608fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2609fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2610fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2611fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2612fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2613fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2614abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2615fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2616fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2617fcdce8c4SStefano Zampini   }
2618fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2619fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2620fcdce8c4SStefano Zampini }
2621fcdce8c4SStefano Zampini 
2622fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2623fcdce8c4SStefano Zampini 
/*
  Chooses the symbolic-phase routine of a matrix product; B may be dense or sparse.

  - B is (derived from) MATSEQDENSE:
      AB, AtB, ABt, PtAP, RARt -> the SeqAIJCUSPARSE/SeqDENSECUDA symbolic routine, unless A is
                                  bound to the CPU, in which case the SeqAIJ/SeqDense setup is invoked;
      ABC                      -> MatProductSymbolic_ABC_Basic;
      any other type           -> nothing is set here.
  - B is MATSEQAIJCUSPARSE with neither A nor B bound to the CPU (and, for ABC, C is
    MATSEQAIJCUSPARSE and not bound to the CPU):
      AB, AtB, ABt             -> the SeqAIJCUSPARSE/SeqAIJCUSPARSE symbolic routine;
      PtAP, RARt, ABC          -> MatProductSymbolic_ABC_Basic;
      any other type           -> nothing is set here.
  - everything else falls back to MatProductSetFromOptions_SeqAIJ().
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  PetscErrorCode ierr;
  Mat_Product    *prod  = mat->product;
  PetscBool      Bdense = PETSC_FALSE;
  PetscBool      Bcusp  = PETSC_FALSE;
  PetscBool      Ccusp  = PETSC_TRUE; /* only re-evaluated for ABC products */

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)prod->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B is only tested for the CUSPARSE type when neither A nor B is bound to the CPU */
  if (!(prod->A->boundtocpu || prod->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)prod->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  /* the third operand exists only for ABC; it must be CUSPARSE and not bound to the CPU */
  if (prod->type == MATPRODUCT_ABC) {
    Ccusp = PETSC_FALSE;
    if (!prod->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)prod->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }

  /* dense B */
  if (Bdense) {
    switch (prod->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!prod->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    PetscFunctionReturn(0);
  }

  /* operands not all usable as CUSPARSE: fallback for AIJ */
  if (!Bcusp || !Ccusp) {
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* sparse CUSPARSE operands */
  if (prod->type == MATPRODUCT_AB || prod->type == MATPRODUCT_AtB || prod->type == MATPRODUCT_ABt) {
    mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  } else if (prod->type == MATPRODUCT_PtAP || prod->type == MATPRODUCT_RARt || prod->type == MATPRODUCT_ABC) {
    mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
  }
  PetscFunctionReturn(0);
}
2682ccdfe979SStefano Zampini 
/* yy = A xx: forwards to the shared multiply-add kernel with no vector to add, no transpose and no conjugation */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2691e6e9a74fSStefano Zampini 
/* zz = A xx + yy: forwards to the shared multiply-add kernel with no transpose and no conjugation */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2700e6e9a74fSStefano Zampini 
/* yy = A^H xx: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set and no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2709e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27189ae82921SPaul Mullowney 
/* yy = A^T xx: forwards to the shared multiply-add kernel with the transpose flag set, no conjugation and no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2727ca45077fSPaul Mullowney 
2728afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2729e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27309ae82921SPaul Mullowney {
27319ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2732aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27339ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2734e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2735b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
273657d48284SJunchao Zhang   cudaError_t                  cerr;
2737aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2738e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2739e6e9a74fSStefano Zampini   PetscBool                    compressed;
2740afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2741afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2742afb2bd1cSJunchao Zhang #endif
27436e111a19SKarl Rupp 
27449ae82921SPaul Mullowney   PetscFunctionBegin;
2745e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2746e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2747afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2748d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2749e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2750e6e9a74fSStefano Zampini   }
275134d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
275234d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2753e6e9a74fSStefano Zampini   if (!trans) {
27549ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2755c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2756e6e9a74fSStefano Zampini   } else {
2757e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2758e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2759e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2760e6e9a74fSStefano Zampini     } else {
2761afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2762e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2763e6e9a74fSStefano Zampini     }
2764e6e9a74fSStefano Zampini   }
2765e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2766e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2767213423ffSJunchao Zhang 
2768e6e9a74fSStefano Zampini   try {
2769e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2770213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2771213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2772afb2bd1cSJunchao Zhang 
277385ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2774e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2775afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2776afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2777afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2778afb2bd1cSJunchao Zhang       */
2779e6e9a74fSStefano Zampini       xptr = xarray;
2780afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2781213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2782afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2783afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2784afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2785afb2bd1cSJunchao Zhang        */
2786afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2787afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2788afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2789afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2790afb2bd1cSJunchao Zhang       }
2791afb2bd1cSJunchao Zhang      #endif
2792e6e9a74fSStefano Zampini     } else {
2793afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2794afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2795afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2796afb2bd1cSJunchao Zhang        */
2797afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2798e6e9a74fSStefano Zampini       dptr = zarray;
2799e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2800afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2801e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2802e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2803e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2804e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2805e6e9a74fSStefano Zampini       }
2806afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2807afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2808afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2809afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2810afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2811afb2bd1cSJunchao Zhang       }
2812afb2bd1cSJunchao Zhang      #endif
2813e6e9a74fSStefano Zampini     }
28149ae82921SPaul Mullowney 
2815afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2816aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2817afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2818afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2819afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2820afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2821afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2822afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2823afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2824afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2825afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2826afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2827afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2828afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2829afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2830afb2bd1cSJunchao Zhang 
2831afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2832afb2bd1cSJunchao Zhang       } else {
2833afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2834afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2835afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2836afb2bd1cSJunchao Zhang       }
2837afb2bd1cSJunchao Zhang 
2838afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2839afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2840afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2841afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2842afb2bd1cSJunchao Zhang                                beta,
2843afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2844afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2845afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2846afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2847afb2bd1cSJunchao Zhang      #else
28487656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2849e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2850a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2851afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2852aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2853e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
285457d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2855afb2bd1cSJunchao Zhang      #endif
2856aa372e3fSPaul Mullowney     } else {
2857213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2858afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2859afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2860afb2bd1cSJunchao Zhang        #else
2861301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2862e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2863afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2864e6e9a74fSStefano Zampini                                  xptr, beta,
286557d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2866afb2bd1cSJunchao Zhang        #endif
2867a65300a6SPaul Mullowney       }
2868aa372e3fSPaul Mullowney     }
286905035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2870958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2871aa372e3fSPaul Mullowney 
2872e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2873213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2874213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2875213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2876e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2877213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
28787656d835SStefano Zampini         }
2879213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2880c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
28817656d835SStefano Zampini       }
28827656d835SStefano Zampini 
2883213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2884213423ffSJunchao Zhang       if (compressed) {
2885213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2886e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2887c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2888e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2889c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
289005035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2891958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2892e6e9a74fSStefano Zampini       }
2893e6e9a74fSStefano Zampini     } else {
2894e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2895e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2896e6e9a74fSStefano Zampini       }
2897e6e9a74fSStefano Zampini     }
2898e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2899213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2900213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29019ae82921SPaul Mullowney   } catch(char *ex) {
29029ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29039ae82921SPaul Mullowney   }
2904e6e9a74fSStefano Zampini   if (yy) {
2905958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2906e6e9a74fSStefano Zampini   } else {
2907e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2908e6e9a74fSStefano Zampini   }
29099ae82921SPaul Mullowney   PetscFunctionReturn(0);
29109ae82921SPaul Mullowney }
29119ae82921SPaul Mullowney 
29126fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2913ca45077fSPaul Mullowney {
2914b175d8bbSPaul Mullowney   PetscErrorCode ierr;
29156e111a19SKarl Rupp 
2916ca45077fSPaul Mullowney   PetscFunctionBegin;
2917e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2918ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2919ca45077fSPaul Mullowney }
2920ca45077fSPaul Mullowney 
29216fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
29229ae82921SPaul Mullowney {
29239ae82921SPaul Mullowney   PetscErrorCode              ierr;
2924a587d139SMark   PetscSplitCSRDataStructure  *d_mat = NULL;
29259ae82921SPaul Mullowney   PetscFunctionBegin;
2926bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
29273fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2928bc3f50f2SPaul Mullowney   }
29293fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
29303fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2931a587d139SMark   if (d_mat) {
29323fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
29333fa6b06aSMark Adams   }
29343fa6b06aSMark Adams 
29359ae82921SPaul Mullowney   PetscFunctionReturn(0);
29369ae82921SPaul Mullowney }
29379ae82921SPaul Mullowney 
29389ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2939e057df02SPaul Mullowney /*@
29409ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
2942e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2943e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2944e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2945e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
29469ae82921SPaul Mullowney 
2947d083f849SBarry Smith    Collective
29489ae82921SPaul Mullowney 
29499ae82921SPaul Mullowney    Input Parameters:
29509ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
29519ae82921SPaul Mullowney .  m - number of rows
29529ae82921SPaul Mullowney .  n - number of columns
29539ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
29549ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
29550298fd71SBarry Smith          (possibly different for each row) or NULL
29569ae82921SPaul Mullowney 
29579ae82921SPaul Mullowney    Output Parameter:
29589ae82921SPaul Mullowney .  A - the matrix
29599ae82921SPaul Mullowney 
29609ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
29629ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
29639ae82921SPaul Mullowney 
29649ae82921SPaul Mullowney    Notes:
29659ae82921SPaul Mullowney    If nnz is given then nz is ignored
29669ae82921SPaul Mullowney 
29679ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
29689ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
29699ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
29709ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
29719ae82921SPaul Mullowney 
29729ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
29730298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
29749ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
29759ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
29769ae82921SPaul Mullowney 
29779ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
29789ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
29799ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
29809ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
29819ae82921SPaul Mullowney 
29829ae82921SPaul Mullowney    Level: intermediate
29839ae82921SPaul Mullowney 
2984e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
29859ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: create the matrix object; *A is written right away, so it is set even if a later step fails */
  ierr = MatCreate(comm,A);
  CHKERRQ(ierr);

  /* step 2: the same m and n are passed for both size pairs of MatSetSizes() */
  ierr = MatSetSizes(*A,m,n,m,n);
  CHKERRQ(ierr);

  /* step 3: select the CUSPARSE implementation */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);
  CHKERRQ(ierr);

  /* step 4: preallocate through the SeqAIJ routine; the const qualifier of nnz is cast away for that call */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
29979ae82921SPaul Mullowney 
29986fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
29999ae82921SPaul Mullowney {
30009ae82921SPaul Mullowney   PetscErrorCode              ierr;
30013fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL;
3002ab25e6cbSDominic Meiser 
30039ae82921SPaul Mullowney   PetscFunctionBegin;
30049ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
30053fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
30063fa6b06aSMark Adams     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3007470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
30089ae82921SPaul Mullowney   } else {
3009470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3010aa372e3fSPaul Mullowney   }
30113fa6b06aSMark Adams   if (d_mat) {
30123fa6b06aSMark Adams     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
30133fa6b06aSMark Adams     cudaError_t                err;
30143fa6b06aSMark Adams     PetscSplitCSRDataStructure h_mat;
30153fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
30163fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
30173fa6b06aSMark Adams     if (a->compressedrow.use) {
30183fa6b06aSMark Adams       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
30193fa6b06aSMark Adams     }
30203fa6b06aSMark Adams     err = cudaFree(d_mat);CHKERRCUDA(err);
30213fa6b06aSMark Adams   }
3022ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3023ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3024ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3025fcdce8c4SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3026ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
30277e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
30287e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
30299ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
30309ae82921SPaul Mullowney   PetscFunctionReturn(0);
30319ae82921SPaul Mullowney }
30329ae82921SPaul Mullowney 
/* forward declarations needed by the routines below */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/*
   MatDuplicate_SeqAIJCUSPARSE - duplicates A with the host SeqAIJ routine and then
   converts the resulting matrix in place to the SEQAIJCUSPARSE type.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host duplicate first ... */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);
  CHKERRQ(ierr);
  /* ... then switch the duplicate over to the CUSPARSE implementation, reusing the same object */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30449ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y for SEQAIJCUSPARSE matrices.

   Dispatch:
     - X and Y do not share the same axpy implementation -> host MatAXPY_SeqAIJ()
     - SUBSET_NONZERO_PATTERN                            -> cusparse_csr_spgeam() written in place into Y
     - SAME_NONZERO_PATTERN                              -> cublasXaxpy() on the value arrays
     - anything else                                     -> host MatAXPY_SeqAIJ() with DIFFERENT_NONZERO_PATTERN

   When the caller did not claim SAME_NONZERO_PATTERN but both matrices have the same
   number of nonzeros and neither uses compressed rows, the row offsets and column
   indices are compared on the device and the pattern is upgraded to
   SAME_NONZERO_PATTERN if they match.

   Only the MAT_CUSPARSE_CSR storage format is supported on the GPU paths.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  Mat_SeqAIJ         *xh = (Mat_SeqAIJ*)X->data;
  Mat_SeqAIJ         *yh = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *xd,*yd;
  CsrMatrix          *xcsr,*ycsr;
  const PetscScalar  *xvals;
  PetscScalar        *yvals;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* from here on both matrices use this routine, i.e. both are bound to the GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  yd   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  xd   = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (yd->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (xd->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix*)yd->mat->mat;
  xcsr = (CsrMatrix*)xd->mat->mat;

  /* try to promote the operation to a plain cublas axpy: identical patterns allow it */
  if (str != SAME_NONZERO_PATTERN && xh->nz == yh->nz && !xh->compressedrow.use && !yh->compressedrow.use) {
    /* column indices are only compared when the row offsets already agree (short-circuit) */
    const bool samePattern = thrust::equal(thrust::device,ycsr->row_offsets->begin(),ycsr->row_offsets->end(),xcsr->row_offsets->begin())
                          && thrust::equal(thrust::device,ycsr->column_indices->begin(),ycsr->column_indices->end(),xcsr->column_indices->begin());

    if (samePattern) str = SAME_NONZERO_PATTERN;
  }

  switch (str) {
  case SUBSET_NONZERO_PATTERN:
    {
      cusparseStatus_t stat;
      PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      size_t           bufferSize;
      void             *buffer;
#endif

      ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
      /* the scalars a and b live on the host for the duration of the spgeam call */
      stat = cusparseSetPointerMode(yd->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA >= 11.0: the work buffer is sized and allocated before the timed region starts */
      stat = cusparse_csr_spgeam_bufferSize(yd->handle,Y->rmap->n,Y->cmap->n,
                                            &a,xd->mat->descr,xh->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                            &b,yd->mat->descr,yh->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                               yd->mat->descr,       yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* the output arguments are Y's own arrays: the sum is written in place */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparse_csr_spgeam(yd->handle,Y->rmap->n,Y->cmap->n,
                                 &a,xd->mat->descr,xh->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                 &b,yd->mat->descr,yh->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                    yd->mat->descr,       yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),buffer);
#else
      stat = cusparse_csr_spgeam(yd->handle,Y->rmap->n,Y->cmap->n,
                                 &a,xd->mat->descr,xh->nz,xvals,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                 &b,yd->mat->descr,yh->nz,yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                    yd->mat->descr,       yvals,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get());
#endif
      CHKERRCUSPARSE(stat);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuFlops(xh->nz + yh->nz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
      /* switch the handle back to device pointer mode */
      stat = cusparseSetPointerMode(yd->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
      ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
    }
    break;
  case SAME_NONZERO_PATTERN:
    {
      cublasHandle_t cublasv2handle;
      cublasStatus_t berr;
      PetscBLASInt   one = 1, bnz = 1;

      ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(xh->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* identical patterns: the value arrays line up entry by entry, so a dense axpy suffices */
      berr = cublasXaxpy(cublasv2handle,bnz,&a,xvals,one,yvals,one);CHKERRCUBLAS(berr);
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
    }
    break;
  default:
    /* no GPU path for any other pattern relation: defer to the host implementation */
    ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  }
  PetscFunctionReturn(0);
}
314195639643SRichard Tran Mills 
/*
   MatZeroEntries_SeqAIJCUSPARSE - zeros the stored values of A on the host and, when
   possible, on the device as well.

   For an unfactored matrix stored in MAT_CUSPARSE_CSR format the device value arrays of
   the matrix (and of its cached transpose, if any) are filled with zeros, and the
   offload mask is set to PETSC_OFFLOAD_BOTH once the matrix values were zeroed on the
   device. In every other case only the host values are zeroed and the mask is set to
   PETSC_OFFLOAD_CPU.

   The device structures are only reinterpreted as CsrMatrix when the format is
   MAT_CUSPARSE_CSR: with the ELL/HYB formats the same pointer holds a cusparseHybMat_t,
   so the cast would be invalid. NULL A->spptr and NULL CsrMatrix pointers are skipped.

   NOTE(review): for ELL/HYB a cached transpose is left untouched here; presumably it is
   rebuilt when the data is next copied to the GPU - confirm against MatSeqAIJCUSPARSECopyToGPU().
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;
  Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (spptr && spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
        if (matrix && matrix->values) {
          both = PETSC_TRUE;
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
        if (matrix && matrix->values) {
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
    }
  }
  /* zero the host values directly; a->i[A->rmap->n] is the total number of stored entries */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
31733fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - switches the operation table of A between the host
   (SeqAIJ) and the device (SeqAIJCUSPARSE) implementations.

   flg == PETSC_TRUE : the data is first copied back from the GPU, the host kernels are
                       installed and the GPU-only composed functions are removed
   flg == PETSC_FALSE: the CUSPARSE kernels and composed functions are installed

   Factored matrices are left untouched. On exit A->boundtocpu and the inode flag both equal flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  PetscObject    obj  = (PetscObject)A;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);

  if (!flg) {
    /* device implementations */
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;

    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* bring the values back to the host before the host kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* host implementations; the Hermitian-transpose products are cleared (set to NULL) */
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;

    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction(obj,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }

  /* record the binding; inodes are enabled exactly when the matrix is bound to the CPU */
  A->boundtocpu  = flg;
  aij->inode.use = flg;
  PetscFunctionReturn(0);
}
3220a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - turns a SeqAIJ matrix into a SEQAIJCUSPARSE one.

   reuse == MAT_INITIAL_MATRIX : A is duplicated (with values) into *newmat
   reuse == MAT_REUSE_MATRIX   : A is copied into the existing *newmat
   otherwise                   : *newmat is used as is (in-place conversion)

   The target gets VECCUDA as default vector type, a freshly created CUSPARSE struct
   (unless it is being reused or already has one), the CUSPARSE operation table, and
   the SEQAIJCUSPARSE type name. The mtype argument is not used.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  /* first use of CUSPARSE may be via MatConvert */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);

  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default:
    /* nothing to copy: the conversion operates on *newmat directly */
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  /* attach the GPU-side struct only when the target is not reused and has none yet */
  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr = PetscNew(&factors);CHKERRQ(ierr);
      stat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(stat);
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      cusp->format    = MAT_CUSPARSE_CSR;
      cusp->deviceMat = NULL;
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that do not depend on the CPU/GPU binding */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the binding-dependent operations for the GPU (flg = PETSC_FALSE) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
32689ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - type constructor: builds a SeqAIJ matrix, converts it in
   place to SEQAIJCUSPARSE and then processes the CUSPARSE options for the object.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host data structures first */
  ierr = MatCreate_SeqAIJ(B);
  CHKERRQ(ierr);

  /* same object in and out: in-place conversion */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);
  CHKERRQ(ierr);

  /* PetscObjectOptionsBegin()/PetscOptionsEnd() bracket the options query and must stay paired */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
328102fe1965SBarry Smith 
32823ca39a21SBarry Smith /*MC
3283e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3284e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported since CUDA 11.0.
32872692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3288e057df02SPaul Mullowney 
3289e057df02SPaul Mullowney    Options Database Keys:
3290e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3291aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3292a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3293e057df02SPaul Mullowney 
3294e057df02SPaul Mullowney   Level: beginner
3295e057df02SPaul Mullowney 
32968468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3297e057df02SPaul Mullowney M*/
32987f756511SDominic Meiser 
329942c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
330042c9c57cSBarry Smith 
33010f39cd5aSBarry Smith 
33023ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
330342c9c57cSBarry Smith {
330442c9c57cSBarry Smith   PetscErrorCode ierr;
330542c9c57cSBarry Smith 
330642c9c57cSBarry Smith   PetscFunctionBegin;
33073ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33083ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33093ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33103ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
331142c9c57cSBarry Smith   PetscFunctionReturn(0);
331242c9c57cSBarry Smith }
331329b38603SBarry Smith 
3314470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
33157f756511SDominic Meiser {
3316e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
33177f756511SDominic Meiser   cusparseStatus_t stat;
33187f756511SDominic Meiser 
33197f756511SDominic Meiser   PetscFunctionBegin;
33207f756511SDominic Meiser   if (*cusparsestruct) {
3321e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3322e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
33237f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
332481902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
33257e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
33267e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
33277e8381f9SStefano Zampini     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3328afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3329afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
3330afb2bd1cSJunchao Zhang    #endif
3331e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
33327f756511SDominic Meiser   }
33337f756511SDominic Meiser   PetscFunctionReturn(0);
33347f756511SDominic Meiser }
33357f756511SDominic Meiser 
33367f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
33377f756511SDominic Meiser {
33387f756511SDominic Meiser   PetscFunctionBegin;
33397f756511SDominic Meiser   if (*mat) {
33407f756511SDominic Meiser     delete (*mat)->values;
33417f756511SDominic Meiser     delete (*mat)->column_indices;
33427f756511SDominic Meiser     delete (*mat)->row_offsets;
33437f756511SDominic Meiser     delete *mat;
33447f756511SDominic Meiser     *mat = 0;
33457f756511SDominic Meiser   }
33467f756511SDominic Meiser   PetscFunctionReturn(0);
33477f756511SDominic Meiser }
33487f756511SDominic Meiser 
3349470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
33507f756511SDominic Meiser {
33517f756511SDominic Meiser   cusparseStatus_t stat;
33527f756511SDominic Meiser   PetscErrorCode   ierr;
33537f756511SDominic Meiser 
33547f756511SDominic Meiser   PetscFunctionBegin;
33557f756511SDominic Meiser   if (*trifactor) {
335657d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3357afb2bd1cSJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
33587f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
33591b0a6780SStefano Zampini     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
33602cbc15d9SMark     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3361afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
33621b0a6780SStefano Zampini     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3363afb2bd1cSJunchao Zhang    #endif
3364da79fbbcSStefano Zampini     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
33657f756511SDominic Meiser   }
33667f756511SDominic Meiser   PetscFunctionReturn(0);
33677f756511SDominic Meiser }
33687f756511SDominic Meiser 
3369470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
33707f756511SDominic Meiser {
33717f756511SDominic Meiser   CsrMatrix        *mat;
33727f756511SDominic Meiser   cusparseStatus_t stat;
33737f756511SDominic Meiser   cudaError_t      err;
33747f756511SDominic Meiser 
33757f756511SDominic Meiser   PetscFunctionBegin;
33767f756511SDominic Meiser   if (*matstruct) {
33777f756511SDominic Meiser     if ((*matstruct)->mat) {
33787f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3379afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3380afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3381afb2bd1cSJunchao Zhang        #else
33827f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
338357d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3384afb2bd1cSJunchao Zhang        #endif
33857f756511SDominic Meiser       } else {
33867f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
33877f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
33887f756511SDominic Meiser       }
33897f756511SDominic Meiser     }
339057d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
33917f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
3392afb2bd1cSJunchao Zhang     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
33937656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
33947656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3395afb2bd1cSJunchao Zhang 
3396afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3397afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3398afb2bd1cSJunchao Zhang     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3399afb2bd1cSJunchao Zhang     for (int i=0; i<3; i++) {
3400afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
3401afb2bd1cSJunchao Zhang         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3402afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3403afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3404afb2bd1cSJunchao Zhang       }
3405afb2bd1cSJunchao Zhang     }
3406afb2bd1cSJunchao Zhang    #endif
34077f756511SDominic Meiser     delete *matstruct;
34087e8381f9SStefano Zampini     *matstruct = NULL;
34097f756511SDominic Meiser   }
34107f756511SDominic Meiser   PetscFunctionReturn(0);
34117f756511SDominic Meiser }
34127f756511SDominic Meiser 
3413ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
34147f756511SDominic Meiser {
3415e6e9a74fSStefano Zampini   PetscErrorCode ierr;
3416e6e9a74fSStefano Zampini 
34177f756511SDominic Meiser   PetscFunctionBegin;
34187f756511SDominic Meiser   if (*trifactors) {
3419e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3420e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3421e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3422e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
34237f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
34247f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
34257f756511SDominic Meiser     delete (*trifactors)->workVector;
34267e8381f9SStefano Zampini     (*trifactors)->rpermIndices = NULL;
34277e8381f9SStefano Zampini     (*trifactors)->cpermIndices = NULL;
34287e8381f9SStefano Zampini     (*trifactors)->workVector = NULL;
3429ccdfe979SStefano Zampini   }
3430ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3431ccdfe979SStefano Zampini }
3432ccdfe979SStefano Zampini 
3433ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3434ccdfe979SStefano Zampini {
3435e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
3436ccdfe979SStefano Zampini   cusparseHandle_t handle;
3437ccdfe979SStefano Zampini   cusparseStatus_t stat;
3438ccdfe979SStefano Zampini 
3439ccdfe979SStefano Zampini   PetscFunctionBegin;
3440ccdfe979SStefano Zampini   if (*trifactors) {
3441e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
34427f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
344357d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
34447f756511SDominic Meiser     }
3445e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
34467f756511SDominic Meiser   }
34477f756511SDominic Meiser   PetscFunctionReturn(0);
34487f756511SDominic Meiser }
34497e8381f9SStefano Zampini 
34507e8381f9SStefano Zampini struct IJCompare
34517e8381f9SStefano Zampini {
34527e8381f9SStefano Zampini   __host__ __device__
34537e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
34547e8381f9SStefano Zampini   {
34557e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
34567e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
34577e8381f9SStefano Zampini     return false;
34587e8381f9SStefano Zampini   }
34597e8381f9SStefano Zampini };
34607e8381f9SStefano Zampini 
34617e8381f9SStefano Zampini struct IJEqual
34627e8381f9SStefano Zampini {
34637e8381f9SStefano Zampini   __host__ __device__
34647e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
34657e8381f9SStefano Zampini   {
34667e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
34677e8381f9SStefano Zampini     return true;
34687e8381f9SStefano Zampini   }
34697e8381f9SStefano Zampini };
34707e8381f9SStefano Zampini 
34717e8381f9SStefano Zampini struct IJDiff
34727e8381f9SStefano Zampini {
34737e8381f9SStefano Zampini   __host__ __device__
34747e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
34757e8381f9SStefano Zampini   {
34767e8381f9SStefano Zampini     return t1 == t2 ? 0 : 1;
34777e8381f9SStefano Zampini   }
34787e8381f9SStefano Zampini };
34797e8381f9SStefano Zampini 
34807e8381f9SStefano Zampini struct IJSum
34817e8381f9SStefano Zampini {
34827e8381f9SStefano Zampini   __host__ __device__
34837e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
34847e8381f9SStefano Zampini   {
34857e8381f9SStefano Zampini     return t1||t2;
34867e8381f9SStefano Zampini   }
34877e8381f9SStefano Zampini };
34887e8381f9SStefano Zampini 
34897e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
3490e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
34917e8381f9SStefano Zampini {
34927e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3493fcdce8c4SStefano Zampini   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3494bfcc3627SStefano Zampini   THRUSTARRAY                           *cooPerm_v = NULL;
349508391a17SStefano Zampini   thrust::device_ptr<const PetscScalar> d_v;
34967e8381f9SStefano Zampini   CsrMatrix                             *matrix;
34977e8381f9SStefano Zampini   PetscErrorCode                        ierr;
34987e8381f9SStefano Zampini   cudaError_t                           cerr;
34997e8381f9SStefano Zampini   PetscInt                              n;
35007e8381f9SStefano Zampini 
35017e8381f9SStefano Zampini   PetscFunctionBegin;
35027e8381f9SStefano Zampini   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
35037e8381f9SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
35047e8381f9SStefano Zampini   if (!cusp->cooPerm) {
35057e8381f9SStefano Zampini     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
35067e8381f9SStefano Zampini     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
35077e8381f9SStefano Zampini     PetscFunctionReturn(0);
35087e8381f9SStefano Zampini   }
35097e8381f9SStefano Zampini   matrix = (CsrMatrix*)cusp->mat->mat;
35107e8381f9SStefano Zampini   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3511e61fc153SStefano Zampini   if (!v) {
3512e61fc153SStefano Zampini     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3513e61fc153SStefano Zampini     goto finalize;
35147e8381f9SStefano Zampini   }
3515e61fc153SStefano Zampini   n = cusp->cooPerm->size();
351608391a17SStefano Zampini   if (isCudaMem(v)) {
351708391a17SStefano Zampini     d_v = thrust::device_pointer_cast(v);
351808391a17SStefano Zampini   } else {
3519e61fc153SStefano Zampini     cooPerm_v = new THRUSTARRAY(n);
3520e61fc153SStefano Zampini     cooPerm_v->assign(v,v+n);
352108391a17SStefano Zampini     d_v = cooPerm_v->data();
3522e61fc153SStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
352308391a17SStefano Zampini   }
3524bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3525e61fc153SStefano Zampini   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
35267e8381f9SStefano Zampini     if (cusp->cooPerm_a) {
3527bfcc3627SStefano Zampini       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
352808391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3529e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3530e61fc153SStefano Zampini       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3531e61fc153SStefano Zampini       delete cooPerm_w;
35327e8381f9SStefano Zampini     } else {
353308391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
35347e8381f9SStefano Zampini                                                                 matrix->values->begin()));
353508391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
35367e8381f9SStefano Zampini                                                                 matrix->values->end()));
35377e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
35387e8381f9SStefano Zampini     }
35397e8381f9SStefano Zampini   } else {
3540e61fc153SStefano Zampini     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
354108391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3542e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
35437e8381f9SStefano Zampini     } else {
354408391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
35457e8381f9SStefano Zampini                                                                 matrix->values->begin()));
354608391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
35477e8381f9SStefano Zampini                                                                 matrix->values->end()));
35487e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
35497e8381f9SStefano Zampini     }
35507e8381f9SStefano Zampini   }
35517e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3552bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3553e61fc153SStefano Zampini finalize:
3554e61fc153SStefano Zampini   delete cooPerm_v;
35557e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3556e61fc153SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3557fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
3558fcdce8c4SStefano Zampini   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3559fcdce8c4SStefano Zampini   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3560fcdce8c4SStefano Zampini   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3561fcdce8c4SStefano Zampini   a->reallocs         = 0;
3562fcdce8c4SStefano Zampini   A->info.mallocs    += 0;
3563fcdce8c4SStefano Zampini   A->info.nz_unneeded = 0;
3564fcdce8c4SStefano Zampini   A->assembled = A->was_assembled = PETSC_TRUE;
3565fcdce8c4SStefano Zampini   A->num_ass++;
35667e8381f9SStefano Zampini   PetscFunctionReturn(0);
35677e8381f9SStefano Zampini }
35687e8381f9SStefano Zampini 
35697e8381f9SStefano Zampini #include <thrust/binary_search.h>
3570e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
35717e8381f9SStefano Zampini {
35727e8381f9SStefano Zampini   PetscErrorCode     ierr;
35737e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
35747e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
35757e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
35767e8381f9SStefano Zampini   cudaError_t        cerr;
35777e8381f9SStefano Zampini 
35787e8381f9SStefano Zampini   PetscFunctionBegin;
35797e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
35807e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
35817e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
35827e8381f9SStefano Zampini   if (n != cooPerm_n) {
35837e8381f9SStefano Zampini     delete cusp->cooPerm;
35847e8381f9SStefano Zampini     delete cusp->cooPerm_a;
35857e8381f9SStefano Zampini     cusp->cooPerm = NULL;
35867e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
35877e8381f9SStefano Zampini   }
35887e8381f9SStefano Zampini   if (n) {
35897e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
35907e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
35917e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
35927e8381f9SStefano Zampini 
35937e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
35947e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
35957e8381f9SStefano Zampini 
35967e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
35977e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
35987e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
35997e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
36007e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
36017e8381f9SStefano Zampini 
360208391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
36037e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
36047e8381f9SStefano Zampini     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
36057e8381f9SStefano Zampini     *cusp->cooPerm_a = d_i;
36067e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
36077e8381f9SStefano Zampini 
36087e8381f9SStefano Zampini     auto nekey = thrust::unique(fkey, ekey, IJEqual());
36097e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
36107e8381f9SStefano Zampini       delete cusp->cooPerm_a;
36117e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
36127e8381f9SStefano Zampini     } else { /* I couldn't come up with a more elegant algorithm */
36137e8381f9SStefano Zampini       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
36147e8381f9SStefano Zampini       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
36157e8381f9SStefano Zampini       (*cusp->cooPerm_a)[0] = 0;
36167e8381f9SStefano Zampini       w[0] = 0;
36177e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
36187e8381f9SStefano Zampini       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
36197e8381f9SStefano Zampini     }
36207e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
36217e8381f9SStefano Zampini     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
36227e8381f9SStefano Zampini                         search_begin, search_begin + A->rmap->n,
36237e8381f9SStefano Zampini                         ii.begin());
362408391a17SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
362508391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
36267e8381f9SStefano Zampini 
36277e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
36287e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
36297e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
36307e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
36317e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
36327e8381f9SStefano Zampini     a->i[0] = 0;
36337e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
36347e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
3635fcdce8c4SStefano Zampini     a->rmax = 0;
36367e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
36377e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
36387e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
36397e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
36407e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
36417e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
36427e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
36437e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
36447e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
3645fcdce8c4SStefano Zampini       a->rmax = PetscMax(a->rmax,nnzr);
36467e8381f9SStefano Zampini     }
3647fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
36487e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
36497e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3650fcdce8c4SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
36517e8381f9SStefano Zampini   } else {
36527e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
36537e8381f9SStefano Zampini   }
3654e61fc153SStefano Zampini   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
36557e8381f9SStefano Zampini 
36567e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
3657e61fc153SStefano Zampini      The code is so convoluted now that I prefer to copy zeros */
3658e61fc153SStefano Zampini   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
36597e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
36607e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
36617e8381f9SStefano Zampini   A->nonzerostate++;
36627e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
36637e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
36647e8381f9SStefano Zampini 
36657e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
36667e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
36677e8381f9SStefano Zampini   PetscFunctionReturn(0);
36687e8381f9SStefano Zampini }
3669ed502f03SStefano Zampini 
3670ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
3671ed502f03SStefano Zampini {
3672ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3673ed502f03SStefano Zampini   CsrMatrix          *csr;
3674ed502f03SStefano Zampini   PetscErrorCode     ierr;
3675ed502f03SStefano Zampini 
3676ed502f03SStefano Zampini   PetscFunctionBegin;
3677ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3678ed502f03SStefano Zampini   PetscValidPointer(a,2);
3679ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3680ed502f03SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3681ed502f03SStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3682ed502f03SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3683ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
3684ed502f03SStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3685ed502f03SStefano Zampini   *a = csr->values->data().get();
3686ed502f03SStefano Zampini   PetscFunctionReturn(0);
3687ed502f03SStefano Zampini }
3688ed502f03SStefano Zampini 
3689ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
3690ed502f03SStefano Zampini {
3691ed502f03SStefano Zampini   PetscFunctionBegin;
3692ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3693ed502f03SStefano Zampini   PetscValidPointer(a,2);
3694ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3695ed502f03SStefano Zampini   *a = NULL;
3696ed502f03SStefano Zampini   PetscFunctionReturn(0);
3697ed502f03SStefano Zampini }
3698ed502f03SStefano Zampini 
3699*039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
3700*039c6fbaSStefano Zampini {
3701*039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3702*039c6fbaSStefano Zampini   CsrMatrix          *csr;
3703*039c6fbaSStefano Zampini   PetscErrorCode     ierr;
3704*039c6fbaSStefano Zampini 
3705*039c6fbaSStefano Zampini   PetscFunctionBegin;
3706*039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3707*039c6fbaSStefano Zampini   PetscValidPointer(a,2);
3708*039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3709*039c6fbaSStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3710*039c6fbaSStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3711*039c6fbaSStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3712*039c6fbaSStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
3713*039c6fbaSStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3714*039c6fbaSStefano Zampini   *a = csr->values->data().get();
3715*039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3716*039c6fbaSStefano Zampini   PetscFunctionReturn(0);
3717*039c6fbaSStefano Zampini }
3718*039c6fbaSStefano Zampini 
3719*039c6fbaSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
3720*039c6fbaSStefano Zampini {
3721*039c6fbaSStefano Zampini   PetscErrorCode ierr;
3722*039c6fbaSStefano Zampini 
3723*039c6fbaSStefano Zampini   PetscFunctionBegin;
3724*039c6fbaSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3725*039c6fbaSStefano Zampini   PetscValidPointer(a,2);
3726*039c6fbaSStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3727*039c6fbaSStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3728*039c6fbaSStefano Zampini   *a = NULL;
3729*039c6fbaSStefano Zampini   PetscFunctionReturn(0);
3730*039c6fbaSStefano Zampini }
3731*039c6fbaSStefano Zampini 
3732ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
3733ed502f03SStefano Zampini {
3734ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3735ed502f03SStefano Zampini   CsrMatrix          *csr;
3736ed502f03SStefano Zampini 
3737ed502f03SStefano Zampini   PetscFunctionBegin;
3738ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3739ed502f03SStefano Zampini   PetscValidPointer(a,2);
3740ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3741ed502f03SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3742ed502f03SStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3743ed502f03SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
3744ed502f03SStefano Zampini   if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3745ed502f03SStefano Zampini   *a = csr->values->data().get();
3746*039c6fbaSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3747ed502f03SStefano Zampini   PetscFunctionReturn(0);
3748ed502f03SStefano Zampini }
3749ed502f03SStefano Zampini 
3750ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
3751ed502f03SStefano Zampini {
3752ed502f03SStefano Zampini   PetscErrorCode ierr;
3753ed502f03SStefano Zampini 
3754ed502f03SStefano Zampini   PetscFunctionBegin;
3755ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3756ed502f03SStefano Zampini   PetscValidPointer(a,2);
3757ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3758ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3759ed502f03SStefano Zampini   *a = NULL;
3760ed502f03SStefano Zampini   PetscFunctionReturn(0);
3761ed502f03SStefano Zampini }
3762ed502f03SStefano Zampini 
3763ed502f03SStefano Zampini struct IJCompare4
3764ed502f03SStefano Zampini {
3765ed502f03SStefano Zampini   __host__ __device__
37662ed87e7eSStefano Zampini   inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
3767ed502f03SStefano Zampini   {
3768ed502f03SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
3769ed502f03SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
3770ed502f03SStefano Zampini     return false;
3771ed502f03SStefano Zampini   }
3772ed502f03SStefano Zampini };
3773ed502f03SStefano Zampini 
/*
  Shift - Unary functor adding a fixed integer offset to its argument.

  The offset is stored at construction time (on the host); the call operator can be used on both
  host and device. The call operator is const since it only reads the stored offset.
*/
struct Shift
{
  int _shift; /* offset added to every argument */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3785ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEMergeMats - Merges two SeqAIJCUSPARSE matrices side by side, i.e. the [A';B']' operation
  in matlab notation: C has the rows of A (and B), and its columns are those of A followed by those of B.

  Input Parameters:
+ A     - first matrix, must be of type MATSEQAIJCUSPARSE
. B     - second matrix, must be of type MATSEQAIJCUSPARSE and have the same number of local rows as A
- reuse - MAT_INITIAL_MATRIX creates C; MAT_INPLACE_MATRIX is not supported; for any other value
          (e.g. MAT_REUSE_MATRIX) only the numerical values of an existing C are refreshed

  Output Parameter:
. C     - the merged matrix

  Notes:
  Only the CSR storage format is supported (ELL and HYB raise PETSC_ERR_SUP).

  With MAT_INITIAL_MATRIX the CSR structure of C is built on the GPU by merging the COO representations
  of A and B (sorted by row, then column). The positions taken by the entries of A and B inside the
  values of C are stored in Ccusp->cooPerm: the first nnz(A) entries refer to A, the remaining ones to B.
  The row offsets and column indices of C are then copied to the host; the host values array is only
  allocated, the up-to-date values live on the GPU (offloadmask is PETSC_OFFLOAD_GPU on return).

  When refreshing an existing C, cooPerm is used to scatter the current values of A and B into C.

  If both A and B have transgen set, the transpose of C is generated/updated too, by stacking the
  transposes of A and B.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  /* A and B are dereferenced only after they have been validated */
  a     = (Mat_SeqAIJ*)A->data;
  b     = (Mat_SeqAIJ*)B->data;
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its (empty) GPU data structures */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    /* C does not use compressed row storage */
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device copies of the scalar constants stored in the mult struct */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      { /* the COO work arrays live in this scope: they are released as soon as the CSR structure of C
           has been built, and also on any error return taken before that point */
        THRUSTINTARRAY32 Acoo(Annz);
        THRUSTINTARRAY32 Bcoo(Bnnz);
        THRUSTINTARRAY32 Ccoo(c->nz);
        THRUSTINTARRAY32 *Aroff,*Broff;

        if (a->compressedrow.use) { /* need full row offset */
          if (!Acusp->rowoffsets_gpu) {
            Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
            Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
            ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Aroff = Acusp->rowoffsets_gpu;
        } else Aroff = Acsr->row_offsets;
        if (b->compressedrow.use) { /* need full row offset */
          if (!Bcusp->rowoffsets_gpu) {
            Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
            Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
            ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Broff = Bcusp->rowoffsets_gpu;
        } else Broff = Bcsr->row_offsets;
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* expand the row offsets of A and B into one row index per nonzero (COO row indices) */
        stat = cusparseXcsr2coo(Acusp->handle,
                                Aroff->data().get(),
                                Annz,
                                m,
                                Acoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseXcsr2coo(Bcusp->handle,
                                Broff->data().get(),
                                Bnnz,
                                m,
                                Bcoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
        /* flag carried through the merge: 1 for entries coming from A, 0 for entries coming from B */
        auto Aperm = thrust::make_constant_iterator(1);
        auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
        /* the columns of B are placed after the columns of A */
        auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
        auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
        /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
        /* shift the column indices of B in place; the shift is undone after the merge */
        auto Bcib = Bcsr->column_indices->begin();
        auto Bcie = Bcsr->column_indices->end();
        thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
        THRUSTINTARRAY32 wPerm(Annz+Bnnz);
        auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
        auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
        auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
        auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
        auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
        /* p1: start of the part of cooPerm referring to A, p2: start of the part referring to B */
        auto p1 = Ccusp->cooPerm->begin();
        auto p2 = Ccusp->cooPerm->begin();
        thrust::advance(p2,Annz);
        /* merge the (row,col,value,flag) tuples of A and B, ordered by row and then column */
        PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
        thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
        /* positions in C with flag 1 (from A) go to p1, positions with flag 0 (from B) go to p2 */
        auto cci = thrust::make_counting_iterator(zero);
        auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
        PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm.begin(),p1,p2,thrust::identity<int>()));
#else
        auto pred = thrust::identity<int>();
        PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm.begin(),p1,pred));
        PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm.begin(),p2,pred));
#endif
        /* compress the merged COO row indices into the row offsets of C */
        stat = cusparseXcoo2csr(Ccusp->handle,
                                Ccoo.data().get(),
                                c->nz,
                                m,
                                Ccsr->row_offsets->data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* the transpose of C is the transpose of A stacked on top of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          /* step back by one: the last offset of A' is overwritten by the first (shifted) offset of B' */
          thrust::advance(rT,-1);
        }
        if (BT) {
          /* the offsets of B' start after the a->nz entries of A' */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* host side of C: the nonzero structure is copied back from the GPU, the values are only allocated */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row lengths, number of nonempty rows and maximum row length from the row offsets */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* an existing C is updated: validate it before it is dereferenced */
    PetscValidHeaderSpecific((*C),MAT_CLASSID,4);
    PetscCheckTypeName((*C),MATSEQAIJCUSPARSE);
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    if ((*C)->cmap->n != A->cmap->n + B->cmap->n) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of columns %D != %D + %D",(*C)->cmap->n,A->cmap->n,B->cmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* consistency checks on the number of nonzeros of A, B and C and on the size of the permutation */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* pmid splits cooPerm into the positions of the entries of A (before) and of B (after) */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* pair each value of A with its destination in C and apply VecCUDAEquals (presumably an assignment - defined elsewhere) */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* same for the values of B */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* the values of the transpose of C are those of A' followed by those of B' */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU; /* the up-to-date values of C are on the GPU */
  PetscFunctionReturn(0);
}
4088