xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision c215019ae02e8adaf4e6a01b1f07a8650fb5f99a)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
686fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
759ae82921SPaul Mullowney 
767f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
79ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
827f756511SDominic Meiser 
8357181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
8557181aedSStefano Zampini 
867e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
887e8381f9SStefano Zampini 
89*c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
90*c215019aSStefano Zampini 
91b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
92b06137fdSPaul Mullowney {
93b06137fdSPaul Mullowney   cusparseStatus_t   stat;
94b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
95b06137fdSPaul Mullowney 
96b06137fdSPaul Mullowney   PetscFunctionBegin;
97d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
98b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
9957d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
100b06137fdSPaul Mullowney   PetscFunctionReturn(0);
101b06137fdSPaul Mullowney }
102b06137fdSPaul Mullowney 
103b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
104b06137fdSPaul Mullowney {
105b06137fdSPaul Mullowney   cusparseStatus_t   stat;
106b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
107b06137fdSPaul Mullowney 
108b06137fdSPaul Mullowney   PetscFunctionBegin;
109d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1106b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11116a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11257d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11316a2e217SAlejandro Lamas Daviña     }
114b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1156b1cf21dSAlejandro Lamas Daviña   }
11657d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
117b06137fdSPaul Mullowney   PetscFunctionReturn(0);
118b06137fdSPaul Mullowney }
119b06137fdSPaul Mullowney 
120b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
121b06137fdSPaul Mullowney {
122b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1237e8381f9SStefano Zampini   PetscBool          flg;
1247e8381f9SStefano Zampini   PetscErrorCode     ierr;
125ccdfe979SStefano Zampini 
126b06137fdSPaul Mullowney   PetscFunctionBegin;
1277e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1287e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
129ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
130b06137fdSPaul Mullowney   PetscFunctionReturn(0);
131b06137fdSPaul Mullowney }
132b06137fdSPaul Mullowney 
133ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1349ae82921SPaul Mullowney {
1359ae82921SPaul Mullowney   PetscFunctionBegin;
1369ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1379ae82921SPaul Mullowney   PetscFunctionReturn(0);
1389ae82921SPaul Mullowney }
1399ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU, for matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1529ae82921SPaul Mullowney 
15342c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1549ae82921SPaul Mullowney {
1559ae82921SPaul Mullowney   PetscErrorCode ierr;
156bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1579ae82921SPaul Mullowney 
1589ae82921SPaul Mullowney   PetscFunctionBegin;
159bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
160bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1612c7c0729SBarry Smith   (*B)->factortype = ftype;
1622c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1639ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1642205254eSKarl Rupp 
165087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16633d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1679ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1689ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
169087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
170087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
171087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1729ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
173bc3f50f2SPaul Mullowney 
174fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1753ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1769ae82921SPaul Mullowney   PetscFunctionReturn(0);
1779ae82921SPaul Mullowney }
1789ae82921SPaul Mullowney 
179bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
180ca45077fSPaul Mullowney {
181aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1826e111a19SKarl Rupp 
183ca45077fSPaul Mullowney   PetscFunctionBegin;
184ca45077fSPaul Mullowney   switch (op) {
185e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
186aa372e3fSPaul Mullowney     cusparsestruct->format = format;
187ca45077fSPaul Mullowney     break;
188e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
189aa372e3fSPaul Mullowney     cusparsestruct->format = format;
190ca45077fSPaul Mullowney     break;
191ca45077fSPaul Mullowney   default:
19236d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
193ca45077fSPaul Mullowney   }
194ca45077fSPaul Mullowney   PetscFunctionReturn(0);
195ca45077fSPaul Mullowney }
1969ae82921SPaul Mullowney 
197e057df02SPaul Mullowney /*@
198e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
199e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
200aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
201e057df02SPaul Mullowney    Not Collective
202e057df02SPaul Mullowney 
203e057df02SPaul Mullowney    Input Parameters:
2048468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
20536d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2062692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
207e057df02SPaul Mullowney 
208e057df02SPaul Mullowney    Output Parameter:
209e057df02SPaul Mullowney 
210e057df02SPaul Mullowney    Level: intermediate
211e057df02SPaul Mullowney 
2128468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
213e057df02SPaul Mullowney @*/
214e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
215e057df02SPaul Mullowney {
216e057df02SPaul Mullowney   PetscErrorCode ierr;
2176e111a19SKarl Rupp 
218e057df02SPaul Mullowney   PetscFunctionBegin;
219e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
220e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
221e057df02SPaul Mullowney   PetscFunctionReturn(0);
222e057df02SPaul Mullowney }
223e057df02SPaul Mullowney 
224e6e9a74fSStefano Zampini /*@
225e589036eSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
226e6e9a74fSStefano Zampini 
227e6e9a74fSStefano Zampini    Collective on mat
228e6e9a74fSStefano Zampini 
229e6e9a74fSStefano Zampini    Input Parameters:
230e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
231e6e9a74fSStefano Zampini -  transgen - the boolean flag
232e6e9a74fSStefano Zampini 
233e6e9a74fSStefano Zampini    Level: intermediate
234e6e9a74fSStefano Zampini 
235e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
236e6e9a74fSStefano Zampini @*/
237e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
238e6e9a74fSStefano Zampini {
239e6e9a74fSStefano Zampini   PetscErrorCode ierr;
240e6e9a74fSStefano Zampini   PetscBool      flg;
241e6e9a74fSStefano Zampini 
242e6e9a74fSStefano Zampini   PetscFunctionBegin;
243e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
244e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
245e6e9a74fSStefano Zampini   if (flg) {
246e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
24754da937aSStefano Zampini 
248e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
249e6e9a74fSStefano Zampini     cusp->transgen = transgen;
25054da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
25154da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
25254da937aSStefano Zampini     }
253e6e9a74fSStefano Zampini   }
254e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
255e6e9a74fSStefano Zampini }
256e6e9a74fSStefano Zampini 
2574416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2589ae82921SPaul Mullowney {
2599ae82921SPaul Mullowney   PetscErrorCode           ierr;
260e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2619ae82921SPaul Mullowney   PetscBool                flg;
262a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2636e111a19SKarl Rupp 
2649ae82921SPaul Mullowney   PetscFunctionBegin;
265e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2669ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
26754da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
26854da937aSStefano Zampini 
26954da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
270afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
271afb2bd1cSJunchao Zhang 
272e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
273a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
274afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
275afb2bd1cSJunchao Zhang 
2764c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
277a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
278afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
279afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
280afb2bd1cSJunchao Zhang     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
281afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
282afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
283afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
284afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
285afb2bd1cSJunchao Zhang 
286afb2bd1cSJunchao Zhang     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
287afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
288afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
289afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
290afb2bd1cSJunchao Zhang 
291afb2bd1cSJunchao Zhang     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
292afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
293afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
294afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
295afb2bd1cSJunchao Zhang    #endif
2964c87dfd4SPaul Mullowney   }
2970af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2989ae82921SPaul Mullowney   PetscFunctionReturn(0);
2999ae82921SPaul Mullowney }
3009ae82921SPaul Mullowney 
3016fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3029ae82921SPaul Mullowney {
303da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3049ae82921SPaul Mullowney   PetscErrorCode               ierr;
3059ae82921SPaul Mullowney 
3069ae82921SPaul Mullowney   PetscFunctionBegin;
307da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3089ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3099ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3109ae82921SPaul Mullowney   PetscFunctionReturn(0);
3119ae82921SPaul Mullowney }
3129ae82921SPaul Mullowney 
3136fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3149ae82921SPaul Mullowney {
315da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3169ae82921SPaul Mullowney   PetscErrorCode               ierr;
3179ae82921SPaul Mullowney 
3189ae82921SPaul Mullowney   PetscFunctionBegin;
319da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3209ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3219ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3229ae82921SPaul Mullowney   PetscFunctionReturn(0);
3239ae82921SPaul Mullowney }
3249ae82921SPaul Mullowney 
325087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
326087f3262SPaul Mullowney {
327da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
328087f3262SPaul Mullowney   PetscErrorCode               ierr;
329087f3262SPaul Mullowney 
330087f3262SPaul Mullowney   PetscFunctionBegin;
331da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
332087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
333087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
334087f3262SPaul Mullowney   PetscFunctionReturn(0);
335087f3262SPaul Mullowney }
336087f3262SPaul Mullowney 
337087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
338087f3262SPaul Mullowney {
339da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
340087f3262SPaul Mullowney   PetscErrorCode               ierr;
341087f3262SPaul Mullowney 
342087f3262SPaul Mullowney   PetscFunctionBegin;
343da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
344087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
345087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
346087f3262SPaul Mullowney   PetscFunctionReturn(0);
347087f3262SPaul Mullowney }
348087f3262SPaul Mullowney 
349087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3509ae82921SPaul Mullowney {
3519ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3529ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3539ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
354aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3559ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3569ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3579ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3589ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3599ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
360b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
36157d48284SJunchao Zhang   cudaError_t                       cerr;
3629ae82921SPaul Mullowney 
3639ae82921SPaul Mullowney   PetscFunctionBegin;
364cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
365c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3669ae82921SPaul Mullowney     try {
3679ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3689ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
369da79fbbcSStefano Zampini       if (!loTriFactor) {
3702cbc15d9SMark         PetscScalar                       *AALo;
3712cbc15d9SMark 
3722cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3739ae82921SPaul Mullowney 
3749ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
37557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3779ae82921SPaul Mullowney 
3789ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3799ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3809ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3819ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3829ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3839ae82921SPaul Mullowney         v        = aa;
3849ae82921SPaul Mullowney         vi       = aj;
3859ae82921SPaul Mullowney         offset   = 1;
3869ae82921SPaul Mullowney         rowOffset= 1;
3879ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3889ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
389e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3909ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3919ae82921SPaul Mullowney           rowOffset += nz+1;
3929ae82921SPaul Mullowney 
393580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
394580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney           offset      += nz;
3979ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3989ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3999ae82921SPaul Mullowney           offset      += 1;
4009ae82921SPaul Mullowney 
4019ae82921SPaul Mullowney           v  += nz;
4029ae82921SPaul Mullowney           vi += nz;
4039ae82921SPaul Mullowney         }
4042205254eSKarl Rupp 
405aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
406da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
407da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
408aa372e3fSPaul Mullowney         /* Create the matrix description */
40957d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
41057d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4111b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
412afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
413afb2bd1cSJunchao Zhang        #else
41457d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
415afb2bd1cSJunchao Zhang        #endif
41657d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41757d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
418aa372e3fSPaul Mullowney 
419aa372e3fSPaul Mullowney         /* set the operation */
420aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
421aa372e3fSPaul Mullowney 
422aa372e3fSPaul Mullowney         /* set the matrix */
423aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
424aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
425aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
427aa372e3fSPaul Mullowney 
428aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
430aa372e3fSPaul Mullowney 
431aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
432aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
433aa372e3fSPaul Mullowney 
434aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
435aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
436aa372e3fSPaul Mullowney 
437afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
438da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
439afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4401b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
441afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
442afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
443afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
444afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
445afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
446afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
447afb2bd1cSJunchao Zhang       #endif
448afb2bd1cSJunchao Zhang 
449aa372e3fSPaul Mullowney         /* perform the solve analysis */
450aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
451aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
452aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
453afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4541b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
455afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
456afb2bd1cSJunchao Zhang                                #endif
457afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
458da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
459da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
460aa372e3fSPaul Mullowney 
461da79fbbcSStefano Zampini         /* assign the pointer */
462aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4632cbc15d9SMark         loTriFactor->AA_h = AALo;
46457d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
46557d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4664863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
467da79fbbcSStefano Zampini       } else { /* update values only */
4682cbc15d9SMark         if (!loTriFactor->AA_h) {
4692cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4702cbc15d9SMark         }
471da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4722cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
473da79fbbcSStefano Zampini         v        = aa;
474da79fbbcSStefano Zampini         vi       = aj;
475da79fbbcSStefano Zampini         offset   = 1;
476da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
477da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4782cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
479da79fbbcSStefano Zampini           offset      += nz;
4802cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
481da79fbbcSStefano Zampini           offset      += 1;
482da79fbbcSStefano Zampini           v  += nz;
483da79fbbcSStefano Zampini         }
4842cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
485da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
486da79fbbcSStefano Zampini       }
4879ae82921SPaul Mullowney     } catch(char *ex) {
4889ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4899ae82921SPaul Mullowney     }
4909ae82921SPaul Mullowney   }
4919ae82921SPaul Mullowney   PetscFunctionReturn(0);
4929ae82921SPaul Mullowney }
4939ae82921SPaul Mullowney 
494087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4959ae82921SPaul Mullowney {
4969ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4979ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4989ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
499aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
5009ae82921SPaul Mullowney   cusparseStatus_t                  stat;
5019ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
5029ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
5039ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
5049ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
5059ae82921SPaul Mullowney   PetscErrorCode                    ierr;
50657d48284SJunchao Zhang   cudaError_t                       cerr;
5079ae82921SPaul Mullowney 
5089ae82921SPaul Mullowney   PetscFunctionBegin;
509cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
510c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5119ae82921SPaul Mullowney     try {
5129ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5139ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
514da79fbbcSStefano Zampini       if (!upTriFactor) {
5152cbc15d9SMark         PetscScalar *AAUp;
5162cbc15d9SMark 
5172cbc15d9SMark         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
5182cbc15d9SMark 
5199ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
52057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
52157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
5229ae82921SPaul Mullowney 
5239ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5249ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5259ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5269ae82921SPaul Mullowney         offset = nzUpper;
5279ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5289ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5299ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5309ae82921SPaul Mullowney 
531e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5329ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5339ae82921SPaul Mullowney 
534e057df02SPaul Mullowney           /* decrement the offset */
5359ae82921SPaul Mullowney           offset -= (nz+1);
5369ae82921SPaul Mullowney 
537e057df02SPaul Mullowney           /* first, set the diagonal elements */
5389ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
53909f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5409ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5419ae82921SPaul Mullowney 
542580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
543580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5449ae82921SPaul Mullowney         }
5452205254eSKarl Rupp 
546aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
547da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
548da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5492205254eSKarl Rupp 
550aa372e3fSPaul Mullowney         /* Create the matrix description */
55157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
55257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
5531b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
554afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
555afb2bd1cSJunchao Zhang        #else
55657d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
557afb2bd1cSJunchao Zhang        #endif
55857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
55957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
560aa372e3fSPaul Mullowney 
561aa372e3fSPaul Mullowney         /* set the operation */
562aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
563aa372e3fSPaul Mullowney 
564aa372e3fSPaul Mullowney         /* set the matrix */
565aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
566aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
567aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
568aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
569aa372e3fSPaul Mullowney 
570aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
571aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
572aa372e3fSPaul Mullowney 
573aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
574aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
575aa372e3fSPaul Mullowney 
576aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
577aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
578aa372e3fSPaul Mullowney 
579afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
580da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
581afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
5821b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
583afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
584afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
585afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
586afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
587afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
588afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
589afb2bd1cSJunchao Zhang       #endif
590afb2bd1cSJunchao Zhang 
591aa372e3fSPaul Mullowney         /* perform the solve analysis */
592aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
593aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
594aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
595afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
5961b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
597afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
598afb2bd1cSJunchao Zhang                                #endif
599afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
600da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
601da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
602aa372e3fSPaul Mullowney 
603da79fbbcSStefano Zampini         /* assign the pointer */
604aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
6052cbc15d9SMark         upTriFactor->AA_h = AAUp;
60657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
60757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
6084863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
609da79fbbcSStefano Zampini       } else {
6102cbc15d9SMark         if (!upTriFactor->AA_h) {
6112cbc15d9SMark           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
6122cbc15d9SMark         }
613da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
614da79fbbcSStefano Zampini         offset = nzUpper;
615da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
616da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
617da79fbbcSStefano Zampini 
618da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
619da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
620da79fbbcSStefano Zampini 
621da79fbbcSStefano Zampini           /* decrement the offset */
622da79fbbcSStefano Zampini           offset -= (nz+1);
623da79fbbcSStefano Zampini 
624da79fbbcSStefano Zampini           /* first, set the diagonal elements */
6252cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
6262cbc15d9SMark           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
627da79fbbcSStefano Zampini         }
6282cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
629da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
630da79fbbcSStefano Zampini       }
6319ae82921SPaul Mullowney     } catch(char *ex) {
6329ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6339ae82921SPaul Mullowney     }
6349ae82921SPaul Mullowney   }
6359ae82921SPaul Mullowney   PetscFunctionReturn(0);
6369ae82921SPaul Mullowney }
6379ae82921SPaul Mullowney 
638087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6399ae82921SPaul Mullowney {
6409ae82921SPaul Mullowney   PetscErrorCode               ierr;
6419ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6429ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6439ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6449ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6459ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6469ae82921SPaul Mullowney 
6479ae82921SPaul Mullowney   PetscFunctionBegin;
648da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
649087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
650087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
6512205254eSKarl Rupp 
652da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
653aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6549ae82921SPaul Mullowney 
655c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
656e057df02SPaul Mullowney   /* lower triangular indices */
6579ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
658da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
659da79fbbcSStefano Zampini     const PetscInt *r;
660da79fbbcSStefano Zampini 
661da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
662aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
663aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6649ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
665da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
666da79fbbcSStefano Zampini   }
6679ae82921SPaul Mullowney 
668e057df02SPaul Mullowney   /* upper triangular indices */
6699ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
670da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
671da79fbbcSStefano Zampini     const PetscInt *c;
672da79fbbcSStefano Zampini 
673da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
674aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
675aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6769ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
677da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
678da79fbbcSStefano Zampini   }
6799ae82921SPaul Mullowney   PetscFunctionReturn(0);
6809ae82921SPaul Mullowney }
6819ae82921SPaul Mullowney 
682087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
683087f3262SPaul Mullowney {
684087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
685087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
686aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
687aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
688087f3262SPaul Mullowney   cusparseStatus_t                  stat;
689087f3262SPaul Mullowney   PetscErrorCode                    ierr;
69057d48284SJunchao Zhang   cudaError_t                       cerr;
691087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
692087f3262SPaul Mullowney   PetscScalar                       *AAUp;
693087f3262SPaul Mullowney   PetscScalar                       *AALo;
694087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
695087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
696087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
697087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
698087f3262SPaul Mullowney 
699087f3262SPaul Mullowney   PetscFunctionBegin;
700cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
701c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
702087f3262SPaul Mullowney     try {
703da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
704da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
705da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
706087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
70757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
70857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
709087f3262SPaul Mullowney 
710087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
711087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
712087f3262SPaul Mullowney         AiUp[n]=nzUpper;
713087f3262SPaul Mullowney         offset = 0;
714087f3262SPaul Mullowney         for (i=0; i<n; i++) {
715087f3262SPaul Mullowney           /* set the pointers */
716087f3262SPaul Mullowney           v  = aa + ai[i];
717087f3262SPaul Mullowney           vj = aj + ai[i];
718087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
719087f3262SPaul Mullowney 
720087f3262SPaul Mullowney           /* first, set the diagonal elements */
721087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
72209f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
723087f3262SPaul Mullowney           AiUp[i]      = offset;
72409f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
725087f3262SPaul Mullowney 
726087f3262SPaul Mullowney           offset+=1;
727087f3262SPaul Mullowney           if (nz>0) {
728f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
729580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
730087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
731087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
732087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
733087f3262SPaul Mullowney             }
734087f3262SPaul Mullowney             offset+=nz;
735087f3262SPaul Mullowney           }
736087f3262SPaul Mullowney         }
737087f3262SPaul Mullowney 
738aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
739da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
740da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
741087f3262SPaul Mullowney 
742aa372e3fSPaul Mullowney         /* Create the matrix description */
74357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
74457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7451b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
746afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
747afb2bd1cSJunchao Zhang        #else
74857d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
749afb2bd1cSJunchao Zhang        #endif
75057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
75157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
752087f3262SPaul Mullowney 
753aa372e3fSPaul Mullowney         /* set the matrix */
754aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
755aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
756aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
757aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
758aa372e3fSPaul Mullowney 
759aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
760aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
761aa372e3fSPaul Mullowney 
762aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
763aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
764aa372e3fSPaul Mullowney 
765aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
766aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
767aa372e3fSPaul Mullowney 
768afb2bd1cSJunchao Zhang         /* set the operation */
769afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
770afb2bd1cSJunchao Zhang 
771afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
772da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
773afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7741b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
775afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
776afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
777afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
778afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
779afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
780afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
781afb2bd1cSJunchao Zhang       #endif
782afb2bd1cSJunchao Zhang 
783aa372e3fSPaul Mullowney         /* perform the solve analysis */
784aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
785aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
786aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
787afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7881b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
789afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
790afb2bd1cSJunchao Zhang                                 #endif
791afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
792da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
793da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
794aa372e3fSPaul Mullowney 
795da79fbbcSStefano Zampini         /* assign the pointer */
796aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
797aa372e3fSPaul Mullowney 
798aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
799da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
800da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
801aa372e3fSPaul Mullowney 
802aa372e3fSPaul Mullowney         /* Create the matrix description */
80357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
80457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8051b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
807afb2bd1cSJunchao Zhang        #else
80857d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
809afb2bd1cSJunchao Zhang        #endif
81057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
81157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
812aa372e3fSPaul Mullowney 
813aa372e3fSPaul Mullowney         /* set the operation */
814aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
815aa372e3fSPaul Mullowney 
816aa372e3fSPaul Mullowney         /* set the matrix */
817aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
818aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
819aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
820aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
821aa372e3fSPaul Mullowney 
822aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
823aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
824aa372e3fSPaul Mullowney 
825aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
826aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
827aa372e3fSPaul Mullowney 
828aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
829aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
830aa372e3fSPaul Mullowney 
831afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
832da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
833afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8341b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
835afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
836afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
837afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
838afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
839afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
840afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
841afb2bd1cSJunchao Zhang       #endif
842afb2bd1cSJunchao Zhang 
843aa372e3fSPaul Mullowney         /* perform the solve analysis */
844aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
845aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
846aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
847afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8481b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
849afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
850afb2bd1cSJunchao Zhang                                 #endif
851afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
852da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
853da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
854aa372e3fSPaul Mullowney 
855da79fbbcSStefano Zampini         /* assign the pointer */
856aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
857087f3262SPaul Mullowney 
858da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
85957d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
86057d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
861da79fbbcSStefano Zampini       } else {
862da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
863da79fbbcSStefano Zampini         offset = 0;
864da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
865da79fbbcSStefano Zampini           /* set the pointers */
866da79fbbcSStefano Zampini           v  = aa + ai[i];
867da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
868da79fbbcSStefano Zampini 
869da79fbbcSStefano Zampini           /* first, set the diagonal elements */
870da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
871da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
872da79fbbcSStefano Zampini 
873da79fbbcSStefano Zampini           offset+=1;
874da79fbbcSStefano Zampini           if (nz>0) {
875da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
876da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
877da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
878da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
879da79fbbcSStefano Zampini             }
880da79fbbcSStefano Zampini             offset+=nz;
881da79fbbcSStefano Zampini           }
882da79fbbcSStefano Zampini         }
883da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
884da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
885da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
886da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
887da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
888da79fbbcSStefano Zampini       }
88957d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
89057d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
891087f3262SPaul Mullowney     } catch(char *ex) {
892087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
893087f3262SPaul Mullowney     }
894087f3262SPaul Mullowney   }
895087f3262SPaul Mullowney   PetscFunctionReturn(0);
896087f3262SPaul Mullowney }
897087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - pushes the ICC/Cholesky triangular factors of A
  to the GPU and, when the factorization ordering is not the identity, uploads the row
  permutation and its inverse.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  The triangular matrices themselves are built by MatSeqAIJCUSPARSEBuildICCTriMatrices().
  The work vector and the permutation arrays are allocated only on first use; on later
  calls (e.g. a repeated numeric factorization) the existing device arrays are reused and
  only their contents are refreshed, so no device memory is leaked.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* every off-diagonal entry is counted twice (once per triangular factor), the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices: only needed when the ordering is not the identity */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* allocate once, then (re)fill: assign() resizes to n and copies the host indices */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
935087f3262SPaul Mullowney 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization for SEQAIJCUSPARSE.

  The factorization itself is done on the host by MatLUFactorNumeric_SeqAIJ() after A has
  been brought back from the GPU; afterwards the solve callbacks of B are selected according
  to the row/column orderings and the triangular factors are analysed and copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row;
  IS             colperm = fact->col;
  PetscBool      rowIsId,colIsId;

  PetscFunctionBegin;
  /* host-side factorization needs up-to-date host values */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variants: permuted unless both orderings are the identity */
  ierr = ISIdentity(rowperm,&rowIsId);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colIsId);CHKERRQ(ierr);
  if (!rowIsId || !colIsId) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* no multi-right-hand-side solves are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9669ae82921SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE.

  The factorization itself is done on the host by MatCholeskyFactorNumeric_SeqAIJ() after A
  has been brought back from the GPU; afterwards the solve callbacks of B are selected
  according to the ordering and the triangular factors are analysed and copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm = fact->row;
  PetscBool      permIsId;

  PetscFunctionBegin;
  /* host-side factorization needs up-to-date host values */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variants: permuted unless the ordering is the identity */
  ierr = ISIdentity(perm,&permIsId);CHKERRQ(ierr);
  if (!permIsId) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* no multi-right-hand-side solves are provided in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9969ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private - builds the explicit transpose (CSC form)
  of one triangular factor on the GPU and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix (used only as the object for event logging)
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the triangular factor to transpose; on CUDA >= 11 its csr2cscBuffer and
                       csr2cscBufferSize fields are set to the conversion work buffer

  Output Parameter:
. triFactorTranspose - newly allocated factor holding the transposed matrix, its descriptor,
                       its solve-analysis info and (CUDA >= 9) its solve buffer

  Notes:
  The descriptor of the transpose copies the matrix type, index base and diagonal type of
  the original factor and flips the fill mode (the transpose of an upper triangular matrix
  is lower triangular and vice versa). The transpose is stored explicitly, hence it is
  solved with CUSPARSE_OPERATION_NON_TRANSPOSE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTranspose)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the matrix descriptor of the original factor; the fill mode is flipped */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase = cusparseGetMatIndexBase(triFactor->descr);
  fillMode = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (previously a second PetscLogEventBegin() was issued here) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,triFactorT->solvePolicy, triFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTranspose = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - builds and analyses the transposes of the lower
  and upper triangular factors stored in A->spptr, so that transposed solves can be carried
  out as non-transposed solves with the explicitly transposed factors.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are already set

  Notes:
  On return loTriFactorPtrTranspose and upTriFactorPtrTranspose of the container point to
  the newly created transposed factors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1198bda325fcSPaul Mullowney 
1199b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1200bda325fcSPaul Mullowney {
1201aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1202aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1203aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1204bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1205bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1206aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1207b06137fdSPaul Mullowney   cudaError_t                  err;
120885ba7357SStefano Zampini   PetscErrorCode               ierr;
1209b175d8bbSPaul Mullowney 
1210bda325fcSPaul Mullowney   PetscFunctionBegin;
1211fcdce8c4SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
121285ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
121385ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
121485ba7357SStefano Zampini   /* create cusparse matrix */
1215aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
121657d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1217aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
121857d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
121957d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1220aa372e3fSPaul Mullowney 
1221b06137fdSPaul Mullowney   /* set alpha and beta */
1222afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12237656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12247656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1225afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12267656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12277656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
122857d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1229b06137fdSPaul Mullowney 
1230aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1231aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1232aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1233554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1234554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1235aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1236a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1237aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1238aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1239a3fdcf43SKarl Rupp 
1240039c6fbaSStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
124181902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1242afb2bd1cSJunchao Zhang 
124381902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1244afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1245afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1246afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1247afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1248afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1249afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1250afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1251afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1252afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1253afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1254afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1255afb2bd1cSJunchao Zhang    #endif
1256afb2bd1cSJunchao Zhang 
1257a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1258a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1259aa372e3fSPaul Mullowney                             matrix->values->data().get(),
126081902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1261aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1262aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1263afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1264afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1265afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1266afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1267afb2bd1cSJunchao Zhang                           #else
1268afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1269afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1270afb2bd1cSJunchao Zhang                           #endif
1271afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1272aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1273afb2bd1cSJunchao Zhang 
1274afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1275afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1276afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1277afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1278afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1279afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1280afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1281afb2bd1cSJunchao Zhang    #endif
1282aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1283afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1284afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1285afb2bd1cSJunchao Zhang    #else
1286aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
128751c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
128851c6d536SStefano Zampini     /* First convert HYB to CSR */
1289aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1290aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1291aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1292aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1293aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1294aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1295aa372e3fSPaul Mullowney 
1296aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1297aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1298aa372e3fSPaul Mullowney                             temp->values->data().get(),
1299aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
130057d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1301aa372e3fSPaul Mullowney 
1302aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1303aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1304aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1305aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1306aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1307aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1308aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1309aa372e3fSPaul Mullowney 
1310aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1311aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1312aa372e3fSPaul Mullowney                             temp->values->data().get(),
1313aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1314aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1315aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1316aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1317aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
131857d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1319aa372e3fSPaul Mullowney 
1320aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1321aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
132257d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1323aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1324aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1325aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1326aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1327aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1328aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
132957d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1330aa372e3fSPaul Mullowney 
1331aa372e3fSPaul Mullowney     /* assign the pointer */
1332aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1333aa372e3fSPaul Mullowney     /* delete temporaries */
1334aa372e3fSPaul Mullowney     if (tempT) {
1335aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1336aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1337aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1338aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1339087f3262SPaul Mullowney     }
1340aa372e3fSPaul Mullowney     if (temp) {
1341aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1342aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1343aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1344aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1345aa372e3fSPaul Mullowney     }
1346afb2bd1cSJunchao Zhang    #endif
1347aa372e3fSPaul Mullowney   }
134805035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
134985ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
135085ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1351213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1352213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1353aa372e3fSPaul Mullowney   /* assign the pointer */
1354aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1355bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1356bda325fcSPaul Mullowney }
1357bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Runs the two triangular solves with the transposed factors held in
  A->spptr, permuting with rpermIndices before the solves and with cpermIndices after them.

  Input Parameters:
+ A  - matrix whose spptr holds a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector (accessed read-only on the device)

  Output Parameter:
. xx - result vector (accessed write-only on the device)

  Notes:
  The transposed factors are created lazily: when neither of them exists yet,
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called and the pointers are re-read.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  PetscInt                              nloc     = xx->map->n;
  const PetscScalar                     *rhs;
  PetscScalar                           *sol;
  thrust::device_ptr<const PetscScalar> rhsDev;
  thrust::device_ptr<PetscScalar>       solDev;
  cusparseStatus_t                      cs;
  cudaError_t                           cuerr;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Lazily build (and analyze) the transposed factors the first time they are needed */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Fetch the device arrays of both vectors and wrap them as thrust device pointers */
  ierr   = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr   = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  solDev = thrust::device_pointer_cast(sol);
  rhsDev = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: gather bb through the row permutation into xx */
  thrust::copy(thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsDev+nloc, factors->rpermIndices->end()),
               solDev);

  /* Step 2: solve with the transposed upper factor (vector arguments: xx array, then work vector) */
  cs = cusparse_solve(factors->handle, upperT->solveOp,
                      upperT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upperT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upperT->descr,
                      upperT->csrMat->values->data().get(),
                      upperT->csrMat->row_offsets->data().get(),
                      upperT->csrMat->column_indices->data().get(),
                      upperT->solveInfo,
                      sol, work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upperT->solvePolicy, upperT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Step 3: solve with the transposed lower factor (vector arguments: work vector, then xx array) */
  cs = cusparse_solve(factors->handle, lowerT->solveOp,
                      lowerT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      lowerT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, lowerT->descr,
                      lowerT->csrMat->values->data().get(),
                      lowerT->csrMat->row_offsets->data().get(),
                      lowerT->csrMat->column_indices->data().get(),
                      lowerT->solveInfo,
                      work->data().get(), sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,lowerT->solvePolicy, lowerT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Step 4: gather xx through the column permutation into the work vector; this cannot be done in place */
  thrust::copy(thrust::make_permutation_iterator(solDev, factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(solDev+nloc, factors->cpermIndices->end()),
               work->begin());

  /* Step 5: move the permuted result from the work vector back into xx */
  thrust::copy(work->begin(), work->end(), solDev);

  /* Hand the arrays back, wait for the device, then close the timing and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1444bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Runs the two triangular solves with the transposed
  factors held in A->spptr; no permutation is applied to bb or xx.

  Input Parameters:
+ A  - matrix whose spptr holds a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector (accessed read-only on the device)

  Output Parameter:
. xx - result vector (accessed write-only on the device)

  Notes:
  The transposed factors are created lazily: when neither of them exists yet,
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is called and the pointers are re-read.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cs;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Lazily build (and analyze) the transposed factors the first time they are needed */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Fetch the device arrays of both vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: solve with the transposed upper factor (vector arguments: bb array, then work vector) */
  cs = cusparse_solve(factors->handle, upperT->solveOp,
                      upperT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upperT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upperT->descr,
                      upperT->csrMat->values->data().get(),
                      upperT->csrMat->row_offsets->data().get(),
                      upperT->csrMat->column_indices->data().get(),
                      upperT->solveInfo,
                      rhs, work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upperT->solvePolicy, upperT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Step 2: solve with the transposed lower factor (vector arguments: work vector, then xx array) */
  cs = cusparse_solve(factors->handle, lowerT->solveOp,
                      lowerT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      lowerT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, lowerT->descr,
                      lowerT->csrMat->values->data().get(),
                      lowerT->csrMat->row_offsets->data().get(),
                      lowerT->csrMat->column_indices->data().get(),
                      lowerT->solveInfo,
                      work->data().get(), sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,lowerT->solvePolicy, lowerT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Hand the arrays back, wait for the device, then close the timing and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1512bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Runs the lower and then the upper triangular solve with the factors held in
  A->spptr, permuting with rpermIndices before the solves and with cpermIndices after them.

  Input Parameters:
+ A  - matrix whose spptr holds a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector (accessed read-only on the device)

  Output Parameter:
. xx - result vector (accessed write-only on the device)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                     *rhs;
  PetscScalar                           *sol;
  thrust::device_ptr<const PetscScalar> rhsDev;
  thrust::device_ptr<PetscScalar>       solDev;
  cusparseStatus_t                      cs;
  cudaError_t                           cuerr;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Fetch the device arrays of both vectors and wrap them as thrust device pointers */
  ierr   = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr   = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  solDev = thrust::device_pointer_cast(sol);
  rhsDev = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: gather bb through the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsDev, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: solve with the lower factor (vector arguments: work vector, then xx array) */
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      lower->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      work->data().get(), sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,lower->solvePolicy, lower->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Step 3: solve with the upper factor (vector arguments: xx array, then work vector) */
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upper->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      sol, work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upper->solvePolicy, upper->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Step 4: gather the work vector through the column permutation into xx */
  thrust::copy(thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solDev);

  /* Hand the arrays back, wait for the device, then close the timing and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
15879ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Runs the lower and then the upper triangular solve with the
  factors held in A->spptr; no permutation is applied to bb or xx.

  Input Parameters:
+ A  - matrix whose spptr holds a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector (accessed read-only on the device)

  Output Parameter:
. xx - result vector (accessed write-only on the device)
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cs;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Fetch the device arrays of both vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: solve with the lower factor (vector arguments: bb array, then work vector) */
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      lower->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      rhs, work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,lower->solvePolicy, lower->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Step 2: solve with the upper factor (vector arguments: work vector, then xx array) */
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upper->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      work->data().get(), sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upper->solvePolicy, upper->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Hand the arrays back, wait for the device, then close the timing and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16479ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Brings the numerical values of A back to the host.

  Nothing happens unless A->offloadmask is PETSC_OFFLOAD_GPU. In that case the a->nz
  scalars stored in the device CSR matrix are copied into the host array a->a and the
  offload mask is switched to PETSC_OFFLOAD_BOTH. Only values are transferred; row
  offsets and column indices are not touched here.

  NOTE(review): cusp->mat->mat is interpreted as a CsrMatrix; this looks valid only for
  the CSR storage format -- confirm callers never reach here with ELL/HYB.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  /* host copy is not stale: nothing to transfer */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpu    = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr    = (CsrMatrix*)gpu->mat->mat;
  const size_t       nbytes  = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  /* host and device now hold the same values */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
16687e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Hands out the host array of nonzero values of A.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a) of the underlying Mat_SeqAIJ

  The values are first synchronized from the device via MatSeqAIJCUSPARSECopyFromGPU().
  Afterwards the offload mask is set to PETSC_OFFLOAD_CPU, i.e. the host copy is flagged
  as the only up-to-date one (the returned pointer is writable).
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
16807e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of A consistent with the host data.

  Input Parameter:
. A - the matrix; errors with PETSC_ERR_PLIB if A is bound to the CPU

  Work is done only when A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU:

  - If the nonzero state recorded on the device matches A->nonzerostate and the storage
    format is CSR, only the numerical values are re-uploaded; if a transpose was built
    before, it is refreshed with csr2csc.
  - Otherwise the device structures (mat, matTranspose, workVector, rowoffsets_gpu) are
    destroyed and the matrix is rebuilt from the host CSR arrays (using the compressed-row
    description when a->compressedrow.use is set).

  On success the offload mask becomes PETSC_OFFLOAD_BOTH, unless the host has no values
  array (a->a == NULL) in the rebuild path, in which case only the structure was uploaded
  and the mask is left unchanged.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        /* numeric-only csr2csc; note the row offsets come from rowoffsets_gpu, not from matrix->row_offsets */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
);CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      /* Reset the pointers right after freeing them: rowoffsets_gpu is not reassigned in this
         function and workVector is only reassigned at the end of the try block, so without the
         reset an early error return (or the next rebuild) would see dangling pointers and
         double-delete or dereference freed device arrays */
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row description: compressed rows (only nonempty rows) or the full CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* without host values only the structure is uploaded; the matrix is then not flagged as synchronized */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* the scalars alpha_one, beta_zero and beta_one live in device memory, matching the device pointer mode set below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary CSR copy on the device, converted to HYB/ELL below and then freed */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero pattern the device copy corresponds to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18599ae82921SPaul Mullowney 
/* Functor on a 2-tuple: adds element 0 into element 1, i.e. get<1> = get<1> + get<0> */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
1869aa372e3fSPaul Mullowney 
/* Functor on a 2-tuple: copies element 0 into element 1, i.e. get<1> = get<0> */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
18797e8381f9SStefano Zampini 
/* Functor on a 2-tuple: copies element 1 into element 0, i.e. get<0> = get<1> (opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__ void operator()(ZipTuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1889e6e9a74fSStefano Zampini 
/* Per-product data attached to a MatProduct involving a SeqAIJCUSPARSE matrix; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;
  PetscScalar           *Bt;          /* device buffer receiving the explicit transpose of B (pre CUDA-11 ABt/RARt path) */
  Mat                   X;            /* intermediate dense matrix written by the sparse product for PtAP and RARt */
  PetscBool             reusesym;     /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor of B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor of C, or of X for PtAP/RARt */
  PetscInt              Blda;         /* Record leading dimension of B here to detect changes */
  PetscInt              Clda;         /* Record leading dimension of C here to detect changes */
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                  *mmBuffer;    /* device work buffer passed to cusparseSpMM */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1909ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product context.

  Input Parameter:
. data - pointer to the MatMatCusparse structure

  Frees the device buffers, deletes the Bcsr holder, destroys the cuSPARSE descriptors
  that were created (CUDA >= 11 only), destroys the auxiliary matrix X and finally frees
  the structure itself with PetscFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t sperr;
 #endif

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* each resource below is released only if it was actually created */
  if (mm->matSpBDescr) {
    sperr = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(sperr);
  }
  if (mm->mmBuffer) {
    cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr);
  }
  if (mm->mmBuffer2) {
    cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr);
  }
  if (mm->matBDescr) {
    sperr = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(sperr);
  }
  if (mm->matCDescr) {
    sperr = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(sperr);
  }
  if (mm->spgemmDesc) {
    sperr = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(sperr);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1934ccdfe979SStefano Zampini 
1935ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1936ccdfe979SStefano Zampini 
1937ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1938ccdfe979SStefano Zampini {
1939ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1940ccdfe979SStefano Zampini   Mat                          A,B;
1941afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1942ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1943ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1944ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1945ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1946ccdfe979SStefano Zampini   const PetscScalar            *barray;
1947ccdfe979SStefano Zampini   PetscScalar                  *carray;
1948ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1949ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1950ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1951ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1952afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1953ccdfe979SStefano Zampini 
1954ccdfe979SStefano Zampini   PetscFunctionBegin;
1955ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1956ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1957ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1958ccdfe979SStefano Zampini   A    = product->A;
1959ccdfe979SStefano Zampini   B    = product->B;
1960ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1961ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1962ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1963ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1964ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1965ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1966ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1967ccdfe979SStefano Zampini   switch (product->type) {
1968ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1969ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1970ccdfe979SStefano Zampini     mat = cusp->mat;
1971ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1972ccdfe979SStefano Zampini     m   = A->rmap->n;
1973ccdfe979SStefano Zampini     n   = B->cmap->n;
1974ccdfe979SStefano Zampini     break;
1975ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1976e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1977e6e9a74fSStefano Zampini       mat = cusp->mat;
1978e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1979e6e9a74fSStefano Zampini     } else {
1980ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1981ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1982ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1983e6e9a74fSStefano Zampini     }
1984ccdfe979SStefano Zampini     m = A->cmap->n;
1985ccdfe979SStefano Zampini     n = B->cmap->n;
1986ccdfe979SStefano Zampini     break;
1987ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1988ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1989ccdfe979SStefano Zampini     mat = cusp->mat;
1990ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1991ccdfe979SStefano Zampini     m   = A->rmap->n;
1992ccdfe979SStefano Zampini     n   = B->rmap->n;
1993ccdfe979SStefano Zampini     break;
1994ccdfe979SStefano Zampini   default:
1995ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1996ccdfe979SStefano Zampini   }
1997ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1998ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1999ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2000ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2001afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2002ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2003afb2bd1cSJunchao Zhang 
2004ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2005c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2006c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2007c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2008c8378d12SStefano Zampini   } else {
2009c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2010c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2011c8378d12SStefano Zampini   }
2012c8378d12SStefano Zampini 
2013c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2014afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2015afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2016fcdce8c4SStefano Zampini   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2017afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2018fcdce8c4SStefano Zampini     size_t mmBufferSize;
2019afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2020afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2021afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2022afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2023afb2bd1cSJunchao Zhang     }
2024c8378d12SStefano Zampini 
2025afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2026afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2027afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2028afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2029afb2bd1cSJunchao Zhang     }
2030afb2bd1cSJunchao Zhang 
2031afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2032afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2033afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2034afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2035afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2036afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2037afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2038afb2bd1cSJunchao Zhang     }
2039afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2040afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2041afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2042fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2043fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2044fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2045fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2046fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2047fcdce8c4SStefano Zampini     }
2048afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2049afb2bd1cSJunchao Zhang   } else {
2050afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2051afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2052afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2053afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2054afb2bd1cSJunchao Zhang   }
2055afb2bd1cSJunchao Zhang 
2056afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2057afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2058afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2059afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2060fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2061afb2bd1cSJunchao Zhang  #else
2062afb2bd1cSJunchao Zhang   PetscInt k;
2063afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2064ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2065ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2066ccdfe979SStefano Zampini     cublasStatus_t cerr;
2067ccdfe979SStefano Zampini 
2068ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2069ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2070ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2071ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2072ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2073ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2074ccdfe979SStefano Zampini     blda = B->cmap->n;
2075afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2076afb2bd1cSJunchao Zhang   } else {
2077afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2078ccdfe979SStefano Zampini   }
2079ccdfe979SStefano Zampini 
2080afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2081ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2082afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2083ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2084ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2085ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2086ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2087ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2088afb2bd1cSJunchao Zhang  #endif
2089afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2090c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2091c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2092ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2093ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2094ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2095ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2096ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2097ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2098ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2099ccdfe979SStefano Zampini   } else {
2100ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2101ccdfe979SStefano Zampini   }
2102ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2103ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2104ccdfe979SStefano Zampini   }
2105ccdfe979SStefano Zampini   if (!biscuda) {
2106ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2107ccdfe979SStefano Zampini   }
2108ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2109ccdfe979SStefano Zampini }
2110ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase for the product of a
  MATSEQAIJCUSPARSE matrix A (product->A, CSR storage only) with the matrix product->B.

  Supported product types: AB, AtB, ABt, PtAP, RARt; any other type raises PETSC_ERR_PLIB.

  On success:
    - C has local/global sizes m x n determined by the product type and is of type MATSEQDENSECUDA;
    - C->product->data holds a freshly allocated MatMatCusparse, released via MatDestroy_MatMatCusparse;
    - C->ops->productnumeric is set to MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.

  Errors with PETSC_ERR_PLIB if product data is already attached to C, if A is not of type
  MATSEQAIJCUSPARSE, or if A is not stored in MAT_CUSPARSE_CSR format.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* sizes of the result C for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data: attach it to C right after the allocation (same ordering used by
     MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE), so that an error return from any of the
     allocations below does not leave mmdata, and whatever was already stored in it, unreachable */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage: X has A->rmap->n rows and as many columns as C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2184ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse product
  C = op(A)*op(B) with A = product->A, B = product->B and C all of type MATSEQAIJCUSPARSE in
  MAT_CUSPARSE_CSR format. Handled product types: AB, AtB, ABt.

  Requires product data (MatMatCusparse) to be already attached to C. Three paths are possible:
    - mmdata->reusesym is set: the flag is cleared, the GPU structures of C are validated and
      no product is computed;
    - C has no nonzeros (c->nz == 0): nothing is computed;
    - otherwise A and B are copied to the GPU and the product is computed with cuSPARSE, after
      which C->offloadmask is set to PETSC_OFFLOAD_GPU.
  In all three cases C is flagged as assembled before returning.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  MatMatCusparse               *mmdata;
  MatProductType               ptype;
  PetscBool                    sametype;
  PetscErrorCode               ierr;
  cusparseStatus_t             cstat;
  cudaError_t                  cuerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A      = product->A;
  B      = product->B;

  if (mmdata->reusesym) {
    /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  } else if (c->nz) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparseSpMatDescr_t BmatSpDescr;
#endif

    /* both operands must still be GPU matrices, not bound to the CPU */
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
    if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
    if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
    if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");

    /* only CSR storage is handled, for the operands and for the result */
    Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

    /* a transposed product with a symmetric operand is treated as a plain AB product */
    ptype = product->type;
    if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
    if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
    if (ptype != MATPRODUCT_AB && ptype != MATPRODUCT_AtB && ptype != MATPRODUCT_ABt) {
      SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
    }
    /* the multiplication itself is always non-transposed: pick the stored transpose where required */
    Amat = (ptype == MATPRODUCT_AtB) ? Acusp->matTranspose : Acusp->mat;
    Bmat = (ptype == MATPRODUCT_ABt) ? Bcusp->matTranspose : Bcusp->mat;
    Cmat = Ccusp->mat;
    if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
    if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);

    Acsr = (CsrMatrix*)Amat->mat;
    /* B may be in compressed row storage: in that case use the CSR stored in the product data */
    Bcsr = (CsrMatrix*)Bmat->mat;
    if (mmdata->Bcsr) Bcsr = mmdata->Bcsr;
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
    if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* B may be in compressed row storage: use the matching sparse descriptor */
    if (mmdata->Bcsr) BmatSpDescr = mmdata->matSpBDescr;
    else              BmatSpDescr = Bmat->matDescr;
    cstat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                   Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                   cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                   mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(cstat);
    cstat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(cstat);
#else
    cstat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                                Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                                Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                                Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(cstat);
#endif
    /* the flop count was stored in the product data, no need to recompute it */
    ierr  = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
    cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
    ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* common epilogue for all paths: shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2299fcdce8c4SStefano Zampini 
2300fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2301fcdce8c4SStefano Zampini {
2302fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2303fcdce8c4SStefano Zampini   Mat                          A,B;
2304fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2305fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2306fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2307fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2308fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2309fcdce8c4SStefano Zampini   PetscBool                    flg;
2310fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2311fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2312fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2313fcdce8c4SStefano Zampini   MatProductType               ptype;
2314fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2315fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2316fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2317fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2318fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2319fcdce8c4SStefano Zampini   size_t                       bufSize2;
2320fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2321fcdce8c4SStefano Zampini #else
2322fcdce8c4SStefano Zampini   int                          cnz;
2323fcdce8c4SStefano Zampini #endif
2324fcdce8c4SStefano Zampini 
2325fcdce8c4SStefano Zampini   PetscFunctionBegin;
2326fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2327fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2328fcdce8c4SStefano Zampini   A    = product->A;
2329fcdce8c4SStefano Zampini   B    = product->B;
2330fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2331fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2332fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2333fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2334fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2335fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2336fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2337fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2338fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2339fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2340fcdce8c4SStefano Zampini 
2341fcdce8c4SStefano Zampini   /* product data */
2342fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2343fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2344fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2345fcdce8c4SStefano Zampini 
2346fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2347fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2348fcdce8c4SStefano Zampini   ptype = product->type;
2349fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2350fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2351fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2352fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2353fcdce8c4SStefano Zampini   switch (ptype) {
2354fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2355fcdce8c4SStefano Zampini     m = A->rmap->n;
2356fcdce8c4SStefano Zampini     n = B->cmap->n;
2357fcdce8c4SStefano Zampini     k = A->cmap->n;
2358fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2359fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2360fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2361fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2362fcdce8c4SStefano Zampini     break;
2363fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2364fcdce8c4SStefano Zampini     m = A->cmap->n;
2365fcdce8c4SStefano Zampini     n = B->cmap->n;
2366fcdce8c4SStefano Zampini     k = A->rmap->n;
2367fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2368fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2369fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2370fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2371fcdce8c4SStefano Zampini     break;
2372fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2373fcdce8c4SStefano Zampini     m = A->rmap->n;
2374fcdce8c4SStefano Zampini     n = B->rmap->n;
2375fcdce8c4SStefano Zampini     k = A->cmap->n;
2376fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2377fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2378fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2379fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2380fcdce8c4SStefano Zampini     break;
2381fcdce8c4SStefano Zampini   default:
2382fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2383fcdce8c4SStefano Zampini   }
2384fcdce8c4SStefano Zampini 
2385fcdce8c4SStefano Zampini   /* create cusparse matrix */
2386fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2387fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2388fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2389fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2390fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2391fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2392fcdce8c4SStefano Zampini 
2393fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2394fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2395fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2396fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2397fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2398fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2399fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2400fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2401fcdce8c4SStefano Zampini   } else {
2402fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2403fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2404fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2405fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2406fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2407fcdce8c4SStefano Zampini   }
2408fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2409fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2410fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2411fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2412fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2413fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2414fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2415fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2416fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2417fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2418fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2419fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2420fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2421fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2422fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2423fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2424fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2425fcdce8c4SStefano Zampini     c->nz = 0;
2426fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2427fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2428fcdce8c4SStefano Zampini     goto finalizesym;
2429fcdce8c4SStefano Zampini   }
2430fcdce8c4SStefano Zampini 
2431fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2432fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2433fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2434fcdce8c4SStefano Zampini   if (!biscompressed) {
2435fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2436fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2437fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2438fcdce8c4SStefano Zampini #endif
2439fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2440fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2441fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2442fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2443fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2444fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2445fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2446fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2447fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2448fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2449fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2450fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2451fcdce8c4SStefano Zampini     }
2452fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2453fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2454fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2455fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2456fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2457fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2458fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2459fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2460fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2461fcdce8c4SStefano Zampini     }
2462fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2463fcdce8c4SStefano Zampini #endif
2464fcdce8c4SStefano Zampini   }
2465fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2466fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2467fcdce8c4SStefano Zampini   /* precompute flops count */
2468fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2469fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2470fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2471fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2472fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2473fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2474fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2475fcdce8c4SStefano Zampini       }
2476fcdce8c4SStefano Zampini     }
2477fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2478fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2479fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2480fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2481fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2482fcdce8c4SStefano Zampini     }
2483fcdce8c4SStefano Zampini   } else { /* TODO */
2484fcdce8c4SStefano Zampini     flops = 0.;
2485fcdce8c4SStefano Zampini   }
2486fcdce8c4SStefano Zampini 
2487fcdce8c4SStefano Zampini   mmdata->flops = flops;
2488fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2489fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2490fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2491fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2492fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2493fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2494fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2495fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2496fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2497fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2498fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2499fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2500fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2501bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2502fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2503fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2504fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2505fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2506fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2507fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2508fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2509fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2510fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2511fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2512fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2513fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2514fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2515fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2516fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2517bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2518fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2519fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2520fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2521fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2522fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2523fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2524fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2525fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
2526fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2527fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2528fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2529fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2530fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2531fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2532fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2535fcdce8c4SStefano Zampini #else
2536fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2537fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2538fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2539fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2540fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2541fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2542fcdce8c4SStefano Zampini   c->nz = cnz;
2543fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2544fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2545fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2546fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2547fcdce8c4SStefano Zampini 
2548fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2549fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2550fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2551fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2552fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2553fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2554fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2555fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2556fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2557fcdce8c4SStefano Zampini #endif
2558fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2559fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2560fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2561fcdce8c4SStefano Zampini finalizesym:
2562fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2563fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2564fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2565fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2566fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2567fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2568fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2569fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2570fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2571fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2572fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2573fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2574fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2575fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2576fcdce8c4SStefano Zampini   } else {
2577fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2578fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2579fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2580fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2581fcdce8c4SStefano Zampini   }
2582fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2583fcdce8c4SStefano Zampini     PetscInt r = 0;
2584fcdce8c4SStefano Zampini     c->i[0] = 0;
2585fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2586fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2587fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2588fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2589fcdce8c4SStefano Zampini     }
2590fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2591fcdce8c4SStefano Zampini   }
2592fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2593fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2594fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2595fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2596fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2597fcdce8c4SStefano Zampini   c->rmax = 0;
2598fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2599fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2600fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2601fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2602fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2603fcdce8c4SStefano Zampini   }
2604fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2605fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2606fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2607fcdce8c4SStefano Zampini 
2608fcdce8c4SStefano Zampini   C->nonzerostate++;
2609fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2610fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2611fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2612fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2613fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2614fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2615fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2616abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2617fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2618fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2619fcdce8c4SStefano Zampini   }
2620fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2621fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2622fcdce8c4SStefano Zampini }
2623fcdce8c4SStefano Zampini 
2624fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2625fcdce8c4SStefano Zampini 
2626fcdce8c4SStefano Zampini /* handles sparse or dense B */
2627fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
2628fcdce8c4SStefano Zampini {
2629fcdce8c4SStefano Zampini   Mat_Product    *product = mat->product;
2630fcdce8c4SStefano Zampini   PetscErrorCode ierr;
2631fcdce8c4SStefano Zampini   PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;
2632fcdce8c4SStefano Zampini 
2633fcdce8c4SStefano Zampini   PetscFunctionBegin;
2634fcdce8c4SStefano Zampini   MatCheckProduct(mat,1);
2635fcdce8c4SStefano Zampini   ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
2636abb89eb1SStefano Zampini   if (!product->A->boundtocpu && !product->B->boundtocpu) {
2637fcdce8c4SStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
2638fcdce8c4SStefano Zampini   }
2639fcdce8c4SStefano Zampini   if (product->type == MATPRODUCT_ABC) {
2640fcdce8c4SStefano Zampini     Ciscusp = PETSC_FALSE;
2641fcdce8c4SStefano Zampini     if (!product->C->boundtocpu) {
2642fcdce8c4SStefano Zampini       ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
2643fcdce8c4SStefano Zampini     }
2644fcdce8c4SStefano Zampini   }
2645fcdce8c4SStefano Zampini   if (isdense) {
2646ccdfe979SStefano Zampini     switch (product->type) {
2647ccdfe979SStefano Zampini     case MATPRODUCT_AB:
2648ccdfe979SStefano Zampini     case MATPRODUCT_AtB:
2649ccdfe979SStefano Zampini     case MATPRODUCT_ABt:
2650ccdfe979SStefano Zampini     case MATPRODUCT_PtAP:
2651ccdfe979SStefano Zampini     case MATPRODUCT_RARt:
2652fcdce8c4SStefano Zampini      if (product->A->boundtocpu) {
2653fcdce8c4SStefano Zampini         ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
2654fcdce8c4SStefano Zampini       } else {
2655fcdce8c4SStefano Zampini         mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2656fcdce8c4SStefano Zampini       }
2657fcdce8c4SStefano Zampini       break;
2658fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2659fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2660fcdce8c4SStefano Zampini       break;
2661ccdfe979SStefano Zampini     default:
2662ccdfe979SStefano Zampini       break;
2663ccdfe979SStefano Zampini     }
2664fcdce8c4SStefano Zampini   } else if (Biscusp && Ciscusp) {
2665fcdce8c4SStefano Zampini     switch (product->type) {
2666fcdce8c4SStefano Zampini     case MATPRODUCT_AB:
2667fcdce8c4SStefano Zampini     case MATPRODUCT_AtB:
2668fcdce8c4SStefano Zampini     case MATPRODUCT_ABt:
2669fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2670fcdce8c4SStefano Zampini       break;
2671fcdce8c4SStefano Zampini     case MATPRODUCT_PtAP:
2672fcdce8c4SStefano Zampini     case MATPRODUCT_RARt:
2673fcdce8c4SStefano Zampini     case MATPRODUCT_ABC:
2674fcdce8c4SStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
2675fcdce8c4SStefano Zampini       break;
2676fcdce8c4SStefano Zampini     default:
2677fcdce8c4SStefano Zampini       break;
2678fcdce8c4SStefano Zampini     }
2679fcdce8c4SStefano Zampini   } else { /* fallback for AIJ */
2680fcdce8c4SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
2681fcdce8c4SStefano Zampini   }
2682ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2683ccdfe979SStefano Zampini }
2684ccdfe979SStefano Zampini 
26856fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
26869ae82921SPaul Mullowney {
2687b175d8bbSPaul Mullowney   PetscErrorCode ierr;
26889ae82921SPaul Mullowney 
26899ae82921SPaul Mullowney   PetscFunctionBegin;
2690e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2691e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2692e6e9a74fSStefano Zampini }
2693e6e9a74fSStefano Zampini 
2694e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2695e6e9a74fSStefano Zampini {
2696e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2697e6e9a74fSStefano Zampini 
2698e6e9a74fSStefano Zampini   PetscFunctionBegin;
2699e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2700e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2701e6e9a74fSStefano Zampini }
2702e6e9a74fSStefano Zampini 
2703e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2704e6e9a74fSStefano Zampini {
2705e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2706e6e9a74fSStefano Zampini 
2707e6e9a74fSStefano Zampini   PetscFunctionBegin;
2708e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2709e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2710e6e9a74fSStefano Zampini }
2711e6e9a74fSStefano Zampini 
2712e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2713e6e9a74fSStefano Zampini {
2714e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2715e6e9a74fSStefano Zampini 
2716e6e9a74fSStefano Zampini   PetscFunctionBegin;
2717e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
27189ae82921SPaul Mullowney   PetscFunctionReturn(0);
27199ae82921SPaul Mullowney }
27209ae82921SPaul Mullowney 
27216fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2722ca45077fSPaul Mullowney {
2723b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2724ca45077fSPaul Mullowney 
2725ca45077fSPaul Mullowney   PetscFunctionBegin;
2726e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2727ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2728ca45077fSPaul Mullowney }
2729ca45077fSPaul Mullowney 
2730afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2731e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27329ae82921SPaul Mullowney {
27339ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2734aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27359ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2736e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2737b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
273857d48284SJunchao Zhang   cudaError_t                  cerr;
2739aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2740e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2741e6e9a74fSStefano Zampini   PetscBool                    compressed;
2742afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2743afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2744afb2bd1cSJunchao Zhang #endif
27456e111a19SKarl Rupp 
27469ae82921SPaul Mullowney   PetscFunctionBegin;
2747e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2748e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2749afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2750d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2751e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2752e6e9a74fSStefano Zampini   }
275334d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
275434d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2755e6e9a74fSStefano Zampini   if (!trans) {
27569ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2757c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2758e6e9a74fSStefano Zampini   } else {
2759e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2760e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2761e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2762e6e9a74fSStefano Zampini     } else {
2763afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2764e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2765e6e9a74fSStefano Zampini     }
2766e6e9a74fSStefano Zampini   }
2767e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2768e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2769213423ffSJunchao Zhang 
2770e6e9a74fSStefano Zampini   try {
2771e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2772213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2773213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2774afb2bd1cSJunchao Zhang 
277585ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2776e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2777afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2778afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2779afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2780afb2bd1cSJunchao Zhang       */
2781e6e9a74fSStefano Zampini       xptr = xarray;
2782afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2783213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2784afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2785afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2786afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2787afb2bd1cSJunchao Zhang        */
2788afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2789afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2790afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2791afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2792afb2bd1cSJunchao Zhang       }
2793afb2bd1cSJunchao Zhang      #endif
2794e6e9a74fSStefano Zampini     } else {
2795afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2796afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2797afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2798afb2bd1cSJunchao Zhang        */
2799afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2800e6e9a74fSStefano Zampini       dptr = zarray;
2801e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2802afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2803e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2804e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2805e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2806e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2807e6e9a74fSStefano Zampini       }
2808afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2809afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2810afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2811afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2812afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2813afb2bd1cSJunchao Zhang       }
2814afb2bd1cSJunchao Zhang      #endif
2815e6e9a74fSStefano Zampini     }
28169ae82921SPaul Mullowney 
2817afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2818aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2819afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2820afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2821afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2822afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2823afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2824afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2825afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2826afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2827afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2828afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2829afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2830afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2831afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2832afb2bd1cSJunchao Zhang 
2833afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2834afb2bd1cSJunchao Zhang       } else {
2835afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2836afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2837afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2838afb2bd1cSJunchao Zhang       }
2839afb2bd1cSJunchao Zhang 
2840afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2841afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2842afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2843afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2844afb2bd1cSJunchao Zhang                                beta,
2845afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2846afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2847afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2848afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2849afb2bd1cSJunchao Zhang      #else
28507656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2851e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2852a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2853afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2854aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2855e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
285657d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2857afb2bd1cSJunchao Zhang      #endif
2858aa372e3fSPaul Mullowney     } else {
2859213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2860afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2861afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2862afb2bd1cSJunchao Zhang        #else
2863301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2864e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2865afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2866e6e9a74fSStefano Zampini                                  xptr, beta,
286757d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2868afb2bd1cSJunchao Zhang        #endif
2869a65300a6SPaul Mullowney       }
2870aa372e3fSPaul Mullowney     }
287105035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2872958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2873aa372e3fSPaul Mullowney 
2874e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2875213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2876213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2877213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2878e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2879213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
28807656d835SStefano Zampini         }
2881213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2882c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
28837656d835SStefano Zampini       }
28847656d835SStefano Zampini 
2885213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2886213423ffSJunchao Zhang       if (compressed) {
2887213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2888e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2889c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2890e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2891c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
289205035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2893958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2894e6e9a74fSStefano Zampini       }
2895e6e9a74fSStefano Zampini     } else {
2896e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2897e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2898e6e9a74fSStefano Zampini       }
2899e6e9a74fSStefano Zampini     }
2900e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2901213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2902213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29039ae82921SPaul Mullowney   } catch(char *ex) {
29049ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29059ae82921SPaul Mullowney   }
2906e6e9a74fSStefano Zampini   if (yy) {
2907958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2908e6e9a74fSStefano Zampini   } else {
2909e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2910e6e9a74fSStefano Zampini   }
29119ae82921SPaul Mullowney   PetscFunctionReturn(0);
29129ae82921SPaul Mullowney }
29139ae82921SPaul Mullowney 
29146fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2915ca45077fSPaul Mullowney {
2916b175d8bbSPaul Mullowney   PetscErrorCode ierr;
29176e111a19SKarl Rupp 
2918ca45077fSPaul Mullowney   PetscFunctionBegin;
2919e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2920ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2921ca45077fSPaul Mullowney }
2922ca45077fSPaul Mullowney 
29236fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
29249ae82921SPaul Mullowney {
29259ae82921SPaul Mullowney   PetscErrorCode              ierr;
2926a587d139SMark   PetscSplitCSRDataStructure  *d_mat = NULL;
29279ae82921SPaul Mullowney   PetscFunctionBegin;
2928bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
29293fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2930bc3f50f2SPaul Mullowney   }
29313fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
29323fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2933a587d139SMark   if (d_mat) {
29343fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
29353fa6b06aSMark Adams   }
29363fa6b06aSMark Adams 
29379ae82921SPaul Mullowney   PetscFunctionReturn(0);
29389ae82921SPaul Mullowney }
29399ae82921SPaul Mullowney 
29409ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2941e057df02SPaul Mullowney /*@
29429ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2943e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
2944e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2945e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2946e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2947e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
29489ae82921SPaul Mullowney 
2949d083f849SBarry Smith    Collective
29509ae82921SPaul Mullowney 
29519ae82921SPaul Mullowney    Input Parameters:
29529ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
29539ae82921SPaul Mullowney .  m - number of rows
29549ae82921SPaul Mullowney .  n - number of columns
29559ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
29569ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
29570298fd71SBarry Smith          (possibly different for each row) or NULL
29589ae82921SPaul Mullowney 
29599ae82921SPaul Mullowney    Output Parameter:
29609ae82921SPaul Mullowney .  A - the matrix
29619ae82921SPaul Mullowney 
29629ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
29639ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
29649ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
29659ae82921SPaul Mullowney 
29669ae82921SPaul Mullowney    Notes:
29679ae82921SPaul Mullowney    If nnz is given then nz is ignored
29689ae82921SPaul Mullowney 
29699ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
29709ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
29719ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
29729ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
29739ae82921SPaul Mullowney 
29749ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
29750298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
29769ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
29779ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
29789ae82921SPaul Mullowney 
29799ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
29809ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
29819ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
29829ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
29839ae82921SPaul Mullowney 
29849ae82921SPaul Mullowney    Level: intermediate
29859ae82921SPaul Mullowney 
2986e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
29879ae82921SPaul Mullowney @*/
29889ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
29899ae82921SPaul Mullowney {
29909ae82921SPaul Mullowney   PetscErrorCode ierr;
29919ae82921SPaul Mullowney 
29929ae82921SPaul Mullowney   PetscFunctionBegin;
29939ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
29949ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
29959ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
29969ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
29979ae82921SPaul Mullowney   PetscFunctionReturn(0);
29989ae82921SPaul Mullowney }
29999ae82921SPaul Mullowney 
30006fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
30019ae82921SPaul Mullowney {
30029ae82921SPaul Mullowney   PetscErrorCode              ierr;
30033fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL;
3004ab25e6cbSDominic Meiser 
30059ae82921SPaul Mullowney   PetscFunctionBegin;
30069ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
30073fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
30083fa6b06aSMark Adams     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3009470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
30109ae82921SPaul Mullowney   } else {
3011470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3012aa372e3fSPaul Mullowney   }
30133fa6b06aSMark Adams   if (d_mat) {
30143fa6b06aSMark Adams     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
30153fa6b06aSMark Adams     cudaError_t                err;
30163fa6b06aSMark Adams     PetscSplitCSRDataStructure h_mat;
30173fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
30183fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
30193fa6b06aSMark Adams     if (a->compressedrow.use) {
30203fa6b06aSMark Adams       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
30213fa6b06aSMark Adams     }
30223fa6b06aSMark Adams     err = cudaFree(d_mat);CHKERRCUDA(err);
30233fa6b06aSMark Adams   }
3024*c215019aSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3025ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3026ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3027ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3028fcdce8c4SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3029ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
30307e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
30317e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
30329ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
30339ae82921SPaul Mullowney   PetscFunctionReturn(0);
30349ae82921SPaul Mullowney }
30359ae82921SPaul Mullowney 
3036ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
303795639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
30389ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
30399ff858a8SKarl Rupp {
30409ff858a8SKarl Rupp   PetscErrorCode ierr;
30419ff858a8SKarl Rupp 
30429ff858a8SKarl Rupp   PetscFunctionBegin;
30439ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3044ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
30459ff858a8SKarl Rupp   PetscFunctionReturn(0);
30469ff858a8SKarl Rupp }
30479ff858a8SKarl Rupp 
3048039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
304995639643SRichard Tran Mills {
3050e6e9a74fSStefano Zampini   PetscErrorCode     ierr;
3051a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3052039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3053039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3054039c6fbaSStefano Zampini   PetscScalar        *ay;
3055039c6fbaSStefano Zampini   const PetscScalar  *ax;
3056039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
3057039c6fbaSStefano Zampini   cudaError_t        cerr;
3058e6e9a74fSStefano Zampini 
305995639643SRichard Tran Mills   PetscFunctionBegin;
3060039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
3061a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3062a587d139SMark     PetscFunctionReturn(0);
306395639643SRichard Tran Mills   }
3064039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
3065a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3066a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3067039c6fbaSStefano Zampini   cy   = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3068039c6fbaSStefano Zampini   cx   = (Mat_SeqAIJCUSPARSE*)X->spptr;
3069039c6fbaSStefano Zampini   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3070039c6fbaSStefano Zampini   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3071039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
3072039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
3073039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3074039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3075039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3076039c6fbaSStefano Zampini     if (eq) {
3077039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3078039c6fbaSStefano Zampini     }
3079039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3080039c6fbaSStefano Zampini   }
3081039c6fbaSStefano Zampini 
3082039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3083039c6fbaSStefano Zampini     cusparseStatus_t stat;
3084039c6fbaSStefano Zampini     PetscScalar      b = 1.0;
3085039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3086039c6fbaSStefano Zampini     size_t           bufferSize;
3087039c6fbaSStefano Zampini     void             *buffer;
3088039c6fbaSStefano Zampini #endif
3089039c6fbaSStefano Zampini 
3090039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3091039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3092039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3093039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3094039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3095039c6fbaSStefano Zampini                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3096039c6fbaSStefano Zampini                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3097039c6fbaSStefano Zampini                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3098039c6fbaSStefano Zampini     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3099039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3100039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3101039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3102039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3103039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3104039c6fbaSStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3105039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3106039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3107039c6fbaSStefano Zampini     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3108039c6fbaSStefano Zampini #else
3109039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3110039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3111039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3112039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3113039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3114039c6fbaSStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3115039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3116039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3117039c6fbaSStefano Zampini #endif
3118039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3119039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3120039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3121039c6fbaSStefano Zampini     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3122039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3123a587d139SMark     cublasHandle_t cublasv2handle;
3124039c6fbaSStefano Zampini     cublasStatus_t berr;
3125a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3126039c6fbaSStefano Zampini 
3127039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3128039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3129a587d139SMark     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3130a587d139SMark     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3131a587d139SMark     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3132039c6fbaSStefano Zampini     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3133039c6fbaSStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3134a587d139SMark     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3135a587d139SMark     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3136039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3137039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3138a587d139SMark     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3139039c6fbaSStefano Zampini   } else {
3140039c6fbaSStefano Zampini     ierr = MatAXPY_SeqAIJ(Y,a,X,DIFFERENT_NONZERO_PATTERN);CHKERRQ(ierr);
3141a587d139SMark   }
314295639643SRichard Tran Mills   PetscFunctionReturn(0);
314395639643SRichard Tran Mills }
314495639643SRichard Tran Mills 
31453fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
31463fa6b06aSMark Adams {
31473fa6b06aSMark Adams   PetscErrorCode             ierr;
31487e8381f9SStefano Zampini   PetscBool                  both = PETSC_FALSE;
3149a587d139SMark   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
31507e8381f9SStefano Zampini 
31513fa6b06aSMark Adams   PetscFunctionBegin;
31523fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
31533fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
31547e8381f9SStefano Zampini     if (spptr->mat) {
31557e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
31567e8381f9SStefano Zampini       if (matrix->values) {
31577e8381f9SStefano Zampini         both = PETSC_TRUE;
31587e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
31597e8381f9SStefano Zampini       }
31607e8381f9SStefano Zampini     }
31617e8381f9SStefano Zampini     if (spptr->matTranspose) {
31627e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
31637e8381f9SStefano Zampini       if (matrix->values) {
31647e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
31657e8381f9SStefano Zampini       }
31667e8381f9SStefano Zampini     }
31673fa6b06aSMark Adams   }
3168a587d139SMark   //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
3169a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3170a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
31717e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3172a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
31733fa6b06aSMark Adams 
31743fa6b06aSMark Adams   PetscFunctionReturn(0);
31753fa6b06aSMark Adams }
31763fa6b06aSMark Adams 
3177a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3178a587d139SMark {
3179a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3180a587d139SMark   PetscErrorCode ierr;
3181a587d139SMark 
3182a587d139SMark   PetscFunctionBegin;
3183a587d139SMark   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
3184a587d139SMark   if (flg) {
3185a587d139SMark     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3186a587d139SMark 
3187a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3188a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3189a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3190a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3191a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3192a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3193a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3194a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3195fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3196*c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3197a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3198a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3199a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3200a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3201a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3202fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3203a587d139SMark   } else {
3204a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3205a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3206a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3207a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3208a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3209a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3210a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3211a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3212fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3213*c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3214a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3215a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3216a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3217a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3218a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3219fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3220a587d139SMark   }
3221a587d139SMark   A->boundtocpu = flg;
3222a587d139SMark   a->inode.use = flg;
3223a587d139SMark   PetscFunctionReturn(0);
3224a587d139SMark }
3225a587d139SMark 
322649735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
32279ae82921SPaul Mullowney {
32289ae82921SPaul Mullowney   PetscErrorCode   ierr;
3229aa372e3fSPaul Mullowney   cusparseStatus_t stat;
323049735bf3SStefano Zampini   Mat              B;
32319ae82921SPaul Mullowney 
32329ae82921SPaul Mullowney   PetscFunctionBegin;
3233832b2c02SStefano Zampini   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
323449735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
323549735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
323649735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
323749735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
323849735bf3SStefano Zampini   }
323949735bf3SStefano Zampini   B = *newmat;
324049735bf3SStefano Zampini 
324134136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
324234136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
324334136279SStefano Zampini 
324449735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
32459ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3246e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
3247e6e9a74fSStefano Zampini 
3248e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3249e6e9a74fSStefano Zampini       spptr->format = MAT_CUSPARSE_CSR;
3250e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3251e6e9a74fSStefano Zampini       B->spptr = spptr;
32523fa6b06aSMark Adams       spptr->deviceMat = NULL;
32539ae82921SPaul Mullowney     } else {
3254e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3255e6e9a74fSStefano Zampini 
3256e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3257e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3258e6e9a74fSStefano Zampini       B->spptr = spptr;
32599ae82921SPaul Mullowney     }
3260e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
326149735bf3SStefano Zampini   }
3262693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
32639ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
32649ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
326595639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3266693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
32672205254eSKarl Rupp 
3268e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
32699ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3270bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
32719ae82921SPaul Mullowney   PetscFunctionReturn(0);
32729ae82921SPaul Mullowney }
32739ae82921SPaul Mullowney 
327402fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
327502fe1965SBarry Smith {
327602fe1965SBarry Smith   PetscErrorCode ierr;
327702fe1965SBarry Smith 
327802fe1965SBarry Smith   PetscFunctionBegin;
327902fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
32800ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
3281afb2bd1cSJunchao Zhang   ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
3282afb2bd1cSJunchao Zhang   ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
3283afb2bd1cSJunchao Zhang   ierr = PetscOptionsEnd();CHKERRQ(ierr);
328402fe1965SBarry Smith   PetscFunctionReturn(0);
328502fe1965SBarry Smith }
328602fe1965SBarry Smith 
32873ca39a21SBarry Smith /*MC
3288e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3289e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported since CUDA 11.0.
32922692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3293e057df02SPaul Mullowney 
3294e057df02SPaul Mullowney    Options Database Keys:
3295e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3296aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3297a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3298e057df02SPaul Mullowney 
3299e057df02SPaul Mullowney   Level: beginner
3300e057df02SPaul Mullowney 
33018468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3302e057df02SPaul Mullowney M*/
33037f756511SDominic Meiser 
330442c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
330542c9c57cSBarry Smith 
33060f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - Registers MatGetFactor_seqaijcusparse_cusparse() as the
  MATSOLVERCUSPARSE factorization routine for MATSEQAIJCUSPARSE matrices.

  One registration is made per factor type, in the order LU, Cholesky, ILU, ICC.
  The first failing registration is returned to the caller.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const int           nftypes  = (int)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  for (int k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
331829b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases everything owned by a Mat_SeqAIJCUSPARSE and then the struct itself.

  Input Parameter:
. cusparsestruct - address of the pointer to destroy; a NULL pointee is a no-op

  Destroys the mult structs for the matrix and its transpose, deletes the work vector,
  the GPU row offsets and the COO permutation arrays, destroys the cuSPARSE handle if one
  exists and, with CUDA 11 or later, frees the csr2csc buffer. PetscFree() is applied to
  *cusparsestruct last.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) {
    PetscFunctionReturn(0);
  }
  /* both mult structs are stored in the same format */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cudaError_t cerr = cudaFree(cusp->csr2cscBuffer);CHKERRCUDA(cerr);
 #endif
  /* pass the caller's pointer so that PetscFree() operates on it directly */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33407f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its values, column_indices and row_offsets arrays.

  Input Parameter:
. mat - address of the CsrMatrix pointer; set to NULL on return. A NULL pointee is a no-op.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
33537f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload that destroys a triangular-factor struct.

  Input Parameter:
. trifactor - address of the Mat_SeqAIJCUSPARSETriFactorStruct pointer; a NULL pointee is a no-op

  Destroys the matrix descriptor and the solve analysis info when present, the CSR matrix,
  the solve buffer, the AA_h buffer (released with cudaFreeHost()) and, with CUDA 11 or later,
  the csr2csc buffer. PetscFree() is applied to *trifactor last.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!tf) {
    PetscFunctionReturn(0);
  }
  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* pass the caller's pointer so that PetscFree() operates on it directly */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33737f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - Overload that destroys a matrix-vector multiplication struct.

  Input Parameters:
+ matstruct - address of the Mat_SeqAIJCUSPARSEMultStruct pointer; set to NULL on return.
              A NULL pointee is a no-op.
- format    - storage format of (*matstruct)->mat, which selects how that member is released

  Notes:
  For MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB the stored matrix is a cusparseHybMat_t; with
  CUDA 11 or later these formats raise PETSC_ERR_SUP. For any other format the stored matrix
  is treated as a CsrMatrix.

  The error code of CsrMatrix_Destroy() is checked like every other call in this routine.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* release the buffer and dense vector descriptors of every initialized cuSpMV slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
34177f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases the contents of a Mat_SeqAIJCUSPARSETriFactors while
  keeping the container itself allocated.

  Input Parameter:
. trifactors - address of the container pointer; a NULL pointee is a no-op

  Destroys the four triangular-factor structs (lower, upper and their transposes), then deletes
  the row/column permutation index arrays and the work vector and resets those members to NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *tf = *trifactors;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!tf) {
    PetscFunctionReturn(0);
  }
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&tf->upTriFactorPtrTranspose);CHKERRQ(ierr);

  delete tf->rpermIndices;
  tf->rpermIndices = NULL;
  delete tf->cpermIndices;
  tf->cpermIndices = NULL;
  delete tf->workVector;
  tf->workVector = NULL;
  PetscFunctionReturn(0);
}
3437ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - Resets a Mat_SeqAIJCUSPARSETriFactors, destroys its
  cuSPARSE handle if one exists, and frees the container.

  Input Parameter:
. trifactors - address of the container pointer; a NULL pointee is a no-op
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*trifactors) {
    PetscFunctionReturn(0);
  }
  ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34547e8381f9SStefano Zampini 
/* Strict lexicographic "less than" on (first, second) index pairs: compares the first
   components and falls back to the second components only when the first ones are equal. */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = t1.get<0>(), i2 = t2.get<0>();

    return (i1 != i2) ? (i1 < i2) : (t1.get<1>() < t2.get<1>());
  }
};
34657e8381f9SStefano Zampini 
/* Equality on index pairs: true only when both components match. */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
34757e8381f9SStefano Zampini 
/* Binary "changed" flag: returns 0 when the two indices are equal and 1 otherwise. */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
34847e8381f9SStefano Zampini 
/* Logical OR of two flags, returned as 0 or 1 (despite the name this is not an arithmetic sum). */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
34937e8381f9SStefano Zampini 
34947e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - Sets or accumulates the values of A on the GPU from an array
  given in COO ordering, using the permutation stored in cusp->cooPerm.

  Input Parameters:
+ A     - the matrix; A->spptr and its mult struct must exist
. v     - the values, one per entry of cusp->cooPerm. May be host or device memory (decided by
          isCudaMem()), or NULL. With NULL and INSERT_VALUES the stored values are zeroed;
          with NULL and any other mode they are left unchanged.
- imode - ADD_VALUES adds to the stored values; any other mode overwrites them

  Notes:
  If no COO permutation is stored, the routine only performs a final assembly of A and returns.

  When cusp->cooPerm_a is present the input contains repeated entries; they are summed with
  thrust::reduce_by_key() before being added to, or written over, the stored values.

  On return the GPU copy is marked as the valid one and A is flagged as assembled.

  The temporary device arrays are scoped objects rather than new/delete pairs so that they are
  released on every early error return as well.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           cooPerm_v; /* device staging copy of v, filled only when v is host memory */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *matrix;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    goto finalize;
  }
  n = cusp->cooPerm->size();
  if (isCudaMem(v)) {
    d_v = thrust::device_pointer_cast(v);
  } else {
    /* host input: copy it to the device first */
    cooPerm_v.assign(v,v+n);
    d_v = cooPerm_v.data();
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
    if (cusp->cooPerm_a) {
      /* repeated entries: reduce them into a scratch array, then add the scratch array to the stored values */
      THRUSTARRAY cooPerm_w(matrix->values->size());
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
    } else {
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalize:
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
35737e8381f9SStefano Zampini 
35747e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - Builds the CSR nonzero structure of A from n COO
  (row,column) index pairs and stores the permutation used later to insert values in COO order.

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices, copied to the device with assign()
- coo_j - column indices, copied to the device with assign()

  Notes:
  The pairs are sorted by (row,column) on the GPU; the sort permutation is kept in cusp->cooPerm.
  If the sorted pairs contain duplicates, cusp->cooPerm_a is filled by an inclusive scan over
  "pair differs from its predecessor" flags; otherwise cusp->cooPerm_a is deleted and set to NULL.

  The host arrays a->i, a->j and a->a are reallocated, a->a is zeroed, and the matrix is copied to
  the GPU. Any existing transpose mult struct is destroyed. A is left flagged as not assembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           oldn, nzrows = 0;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);

  /* the stored permutation arrays are kept only when the number of COO entries is unchanged */
  oldn = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != oldn) {
    delete cusp->cooPerm;
    cusp->cooPerm = NULL;
    delete cusp->cooPerm_a;
    cusp->cooPerm_a = NULL;
  }

  if (!n) {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  } else {
    const PetscInt m = A->rmap->n;
    THRUSTINTARRAY rows(n);
    THRUSTINTARRAY cols(n);
    THRUSTINTARRAY rowend(m);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    rows.assign(coo_i,coo_i+n);
    cols.assign(coo_j,coo_j+n);
    auto kbegin = thrust::make_zip_iterator(thrust::make_tuple(rows.begin(),cols.begin()));
    auto kend   = thrust::make_zip_iterator(thrust::make_tuple(rows.end(),cols.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* sort the (row,column) pairs and carry the identity permutation along */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(kbegin, kend, cusp->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted indices before unique() compacts rows/cols in place */
    *cusp->cooPerm_a = rows;
    THRUSTINTARRAY coldiff(cols);

    auto kuniq = thrust::unique(kbegin, kend, IJEqual());
    if (kuniq != kend) { /* I couldn't come up with a more elegant algorithm */
      /* flag where the row or the column changes, then scan the flags */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      adjacent_difference(coldiff.begin(),coldiff.end(),coldiff.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      coldiff[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),coldiff.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    } else { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    }
    /* for each row index, count the unique entries whose row does not exceed it */
    thrust::counting_iterator<PetscInt> rowids(0);
    thrust::upper_bound(rows.begin(), kuniq.get_iterator_tuple().get<0>(),
                        rowids, rowids + m,
                        rowend.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host-side CSR arrays */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,rowend.data().get(),m*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz    = a->i[m];
    a->maxnz = a->nz;
    a->rmax  = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,cols.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(m,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(m,&a->imax);CHKERRQ(ierr); }
    for (PetscInt r = 0; r < m; r++) {
      const PetscInt len = a->i[r+1] - a->i[r];

      if (len) nzrows++;
      a->ilen[r] = len;
      a->imax[r] = len;
      a->rmax    = PetscMax(a->rmax,len);
    }
    a->nonzerorowcnt = nzrows;
    A->preallocated  = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzrows,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  A->assembled     = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3674ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayRead - Gives read-only access to the device array holding the values
  of a MATSEQAIJCUSPARSE matrix stored in CSR format.

  Input Parameter:
. A - the matrix

  Output Parameter:
. a - pointer to the device values

  Notes:
  MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are rejected with PETSC_ERR_SUP.

  The matrix is copied to the GPU before the array is returned. The presence of the mult struct
  is checked only after that copy, so a matrix whose GPU struct has not been built yet is not
  rejected prematurely.

  A is dereferenced only after it has been validated.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3693ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - Ends read-only access to the device values array.

  Input Parameters:
+ A - the matrix, which must be of type MATSEQAIJCUSPARSE
- a - the pointer being returned; it is set to NULL

  The access was read-only, so nothing on A is modified.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate the caller's handle */
  *a = NULL;
  PetscFunctionReturn(0);
}
3703ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - Gives read-write access to the device array holding the values
  of a MATSEQAIJCUSPARSE matrix stored in CSR format.

  Input Parameter:
. A - the matrix

  Output Parameter:
. a - pointer to the device values

  Notes:
  MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are rejected with PETSC_ERR_SUP.

  The matrix is copied to the GPU before the array is returned. The presence of the mult struct
  is checked only after that copy, so a matrix whose GPU struct has not been built yet is not
  rejected prematurely. On success the GPU copy is marked as the valid one.

  A is dereferenced only after it has been validated.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may modify the device values, so the host copy is no longer current */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
3723039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Ends read-write access to the device values array.

  Input Parameters:
+ A - the matrix, which must be of type MATSEQAIJCUSPARSE
- a - the pointer being returned; it is set to NULL

  The object state of A is increased because the values may have been changed.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the state first; the handle is cleared only if that succeeds */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
3736039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Gives write-only access to the device array holding the values
  of a MATSEQAIJCUSPARSE matrix stored in CSR format.

  Input Parameter:
. A - the matrix

  Output Parameter:
. a - pointer to the device values

  Notes:
  Unlike the read and read-write variants, the matrix is not copied to the GPU first, so the
  mult struct and its values array must already exist. MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB
  are rejected with PETSC_ERR_SUP. On success the GPU copy is marked as the valid one.

  A is dereferenced only after it has been validated.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller will overwrite the device values, so the host copy is no longer current */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
3754ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Ends write-only access to the device values array.

  Input Parameters:
+ A - the matrix, which must be of type MATSEQAIJCUSPARSE
- a - the pointer being returned; it is set to NULL

  The object state of A is increased because the values have been written.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the state first; the handle is cleared only if that succeeds */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  *a = NULL;
  PetscFunctionReturn(0);
}
3767ed502f03SStefano Zampini 
/* Strict lexicographic "less than" on the first two components of a 4-tuple;
   the third and fourth components do not take part in the comparison. */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int i1 = t1.get<0>(), i2 = t2.get<0>();

    return (i1 != i2) ? (i1 < i2) : (t1.get<1>() < t2.get<1>());
  }
};
3778ed502f03SStefano Zampini 
/*
  Shift - unary functor adding a fixed integer offset to its argument.

  The offset is stored at construction time (on the host) and the call operator
  returns c + _shift. The call operator is const because it does not modify the
  functor, which also allows it to be invoked through const objects.
*/
struct Shift
{
  int _shift; /* offset added to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3790ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEMergeMats - Merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation
  (i.e. the columns of B are appended to the right of the columns of A).

  Input Parameters:
+ A     - first matrix, type MATSEQAIJCUSPARSE, CSR storage (ELL and HYB are rejected)
. B     - second matrix, type MATSEQAIJCUSPARSE, CSR storage, with the same number of rows as A
- reuse - MAT_INITIAL_MATRIX creates C; MAT_INPLACE_MATRIX is rejected; any other value
          (e.g. MAT_REUSE_MATRIX) only refreshes the values of an existing C

  Output Parameter:
. C - the merged matrix, with A->rmap->n rows and A->cmap->n + B->cmap->n columns

  Notes:
  With MAT_INITIAL_MATRIX the nonzero pattern of C is computed on the GPU by merging the
  (row,column)-sorted COO representations of A and B; the CSR row offsets and column indices
  are then copied to the host arrays of C. The host value array of C is allocated but not
  filled: the values live on the GPU and C is flagged PETSC_OFFLOAD_GPU.

  Ccusp->cooPerm records where the entries end up in the value array of C: its first nnz(A)
  entries are the positions of the entries of A, the remaining ones those of the entries of B.
  The reuse path relies on it to scatter the current values of A and B into C.

  If both A and B have transgen set, the transpose of C is assembled too, by stacking the
  transposes of A and B.
*/
PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     Annz,Bnnz;
  cusparseStatus_t             stat;
  PetscInt                     i,m,n,zero = 0;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidHeaderSpecific(B,MAT_CLASSID,2);
  PetscValidPointer(C,4);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
  if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",A->rmap->n,B->rmap->n);
  if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
  if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (reuse == MAT_INITIAL_MATRIX) {
    /* create C and its (still empty) GPU data structures */
    m     = A->rmap->n;
    n     = A->cmap->n + B->cmap->n;
    ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
    ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
    ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    c     = (Mat_SeqAIJ*)(*C)->data;
    Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
    Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
    Ccsr  = new CsrMatrix;
    Cmat->cprowIndices      = NULL;
    c->compressedrow.use    = PETSC_FALSE;
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Ccusp->nrows    = m;
    Ccusp->mat      = Cmat;
    Ccusp->mat->mat = Ccsr;
    Ccsr->num_rows  = m;
    Ccsr->num_cols  = n;
    stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
    /* device-resident scalar constants of the mult struct */
    cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    /* make sure the GPU copies of A and B (and their transposes, when requested) are current */
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
    if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

    Acsr = (CsrMatrix*)Acusp->mat->mat;
    Bcsr = (CsrMatrix*)Bcusp->mat->mat;
    Annz = (PetscInt)Acsr->column_indices->size();
    Bnnz = (PetscInt)Bcsr->column_indices->size();
    c->nz = Annz + Bnnz;
    Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    Ccsr->num_entries = c->nz;
    Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
    if (c->nz) {
      { /* scope of the COO work arrays: they are released when it is left, error returns included */
        THRUSTINTARRAY32 Acoo(Annz);
        THRUSTINTARRAY32 Bcoo(Bnnz);
        THRUSTINTARRAY32 Ccoo(c->nz);
        THRUSTINTARRAY32 *Aroff,*Broff;

        if (a->compressedrow.use) { /* need full row offset */
          if (!Acusp->rowoffsets_gpu) {
            Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
            Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
            ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Aroff = Acusp->rowoffsets_gpu;
        } else Aroff = Acsr->row_offsets;
        if (b->compressedrow.use) { /* need full row offset */
          if (!Bcusp->rowoffsets_gpu) {
            Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
            Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
            ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
          }
          Broff = Bcusp->rowoffsets_gpu;
        } else Broff = Bcsr->row_offsets;
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* expand the CSR row offsets of A and B into explicit COO row indices */
        stat = cusparseXcsr2coo(Acusp->handle,
                                Aroff->data().get(),
                                Annz,
                                m,
                                Acoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseXcsr2coo(Bcusp->handle,
                                Broff->data().get(),
                                Bnnz,
                                m,
                                Bcoo.data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
        /* origin flag carried through the merge: 1 for entries of A, 0 for entries of B */
        auto Aperm = thrust::make_constant_iterator(1);
        auto Bperm = thrust::make_constant_iterator(0);
#if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
        /* columns of B are shifted by the number of columns of A on the fly */
        auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
        auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
#else
        /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
        auto Bcib = Bcsr->column_indices->begin();
        auto Bcie = Bcsr->column_indices->end();
        thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
#endif
        THRUSTINTARRAY32 wPerm(Annz+Bnnz);
        auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
        auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
        auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm));
        auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm));
        auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin()));
        /* p1: start of the A part of cooPerm, p2: start of the B part */
        auto p1 = Ccusp->cooPerm->begin();
        auto p2 = Ccusp->cooPerm->begin();
        thrust::advance(p2,Annz);
        /* merge the two (row,column)-sorted sequences into C; wPerm receives the origin flag of each merged entry */
        PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
#if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
        /* undo the in-place shift of the column indices of B */
        thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
#endif
        auto cci = thrust::make_counting_iterator(zero);
        auto cce = thrust::make_counting_iterator(c->nz);
#if 0 //Errors on SUMMIT cuda 11.1.0
        PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm.begin(),p1,p2,thrust::identity<int>()));
#else
        /* positions in C flagged 1 (from A) go to the first part of cooPerm, those flagged 0 (from B) to the second */
        auto pred = thrust::identity<int>();
        PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm.begin(),p1,pred));
        PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm.begin(),p2,pred));
#endif
        /* compress the merged COO row indices into the CSR row offsets of C */
        stat = cusparseXcoo2csr(Ccusp->handle,
                                Ccoo.data().get(),
                                c->nz,
                                m,
                                Ccsr->row_offsets->data().get(),
                                CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
                               Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
      if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
        CsrMatrix *CcsrT = new CsrMatrix;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;

        Ccusp->transgen = PETSC_TRUE;
        CmatT->cprowIndices  = NULL;
        CmatT->mat = CcsrT;
        CcsrT->num_rows = n;
        CcsrT->num_cols = m;
        CcsrT->num_entries = c->nz;

        CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
        CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
        CcsrT->values = new THRUSTARRAY(c->nz);

        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* the transpose of C is the transpose of A stacked on top of the transpose of B */
        auto rT = CcsrT->row_offsets->begin();
        if (AT) {
          rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
          thrust::advance(rT,-1); /* the last offset of A^T is overwritten by the first (shifted) offset of B^T */
        }
        if (BT) {
          /* row offsets of B^T start after the a->nz entries of A^T */
          auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
          auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
          thrust::copy(titb,tite,rT);
        }
        auto cT = CcsrT->column_indices->begin();
        if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
        if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

        stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
                                 CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
                                 CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                                 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
#endif
        Ccusp->matTranspose = CmatT;
      }
    }

    /* mirror the nonzero pattern of C on the host */
    c->singlemalloc = PETSC_FALSE;
    c->free_a       = PETSC_TRUE;
    c->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
    ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
    if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
      THRUSTINTARRAY ii(Ccsr->row_offsets->size());
      THRUSTINTARRAY jj(Ccsr->column_indices->size());
      ii   = *Ccsr->row_offsets;
      jj   = *Ccsr->column_indices;
      cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    } else {
      cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
    ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
    ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
    c->maxnz = c->nz;
    c->nonzerorowcnt = 0;
    c->rmax = 0;
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (i = 0; i < m; i++) {
      const PetscInt nn = c->i[i+1] - c->i[i];
      c->ilen[i] = c->imax[i] = nn;
      c->nonzerorowcnt += (PetscInt)!!nn;
      c->rmax = PetscMax(c->rmax,nn);
    }
    ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
    /* host values are allocated only; the up-to-date values are on the GPU (see offloadmask below) */
    ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
    (*C)->nonzerostate++;
    ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
    Ccusp->nonzerostate = (*C)->nonzerostate;
    (*C)->preallocated  = PETSC_TRUE;
  } else {
    /* C is provided by the caller: validate it before looking inside */
    PetscValidHeaderSpecific(*C,MAT_CLASSID,4);
    PetscCheckTypeName(*C,MATSEQAIJCUSPARSE);
    if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number of rows %D != %D",(*C)->rmap->n,B->rmap->n);
    c = (Mat_SeqAIJ*)(*C)->data;
    if (c->nz) {
      Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
      if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
      if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
      if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
      if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
      Acsr = (CsrMatrix*)Acusp->mat->mat;
      Bcsr = (CsrMatrix*)Bcusp->mat->mat;
      Ccsr = (CsrMatrix*)Ccusp->mat->mat;
      /* the stored permutation is only meaningful if the sizes still match those of the initial merge */
      if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
      if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
      if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
      if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
      if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
      /* pmid separates the A part of cooPerm from the B part */
      auto pmid = Ccusp->cooPerm->begin();
      thrust::advance(pmid,Acsr->num_entries);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      /* scatter the values of A into C through the first part of the permutation */
      auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
      auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      thrust::for_each(zibait,zieait,VecCUDAEquals());
      /* scatter the values of B into C through the second part of the permutation */
      auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
      auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
                                                                 thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
      thrust::for_each(zibbit,ziebit,VecCUDAEquals());
      if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
        if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
        PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
        CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
        CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
        CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
        /* values of the transpose of C: those of A^T followed by those of B^T */
        auto vT = CcsrT->values->begin();
        if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
        if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
      }
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    }
  }
  ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
  (*C)->assembled     = PETSC_TRUE;
  (*C)->was_assembled = PETSC_FALSE;
  (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
4093*c215019aSStefano Zampini 
4094*c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
4095*c215019aSStefano Zampini {
4096*c215019aSStefano Zampini   PetscErrorCode    ierr;
4097*c215019aSStefano Zampini   bool              dmem;
4098*c215019aSStefano Zampini   const PetscScalar *av;
4099*c215019aSStefano Zampini   cudaError_t       cerr;
4100*c215019aSStefano Zampini 
4101*c215019aSStefano Zampini   PetscFunctionBegin;
4102*c215019aSStefano Zampini   dmem = isCudaMem(v);
4103*c215019aSStefano Zampini   ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
4104*c215019aSStefano Zampini   if (n && idx) {
4105*c215019aSStefano Zampini     THRUSTINTARRAY widx(n);
4106*c215019aSStefano Zampini     widx.assign(idx,idx+n);
4107*c215019aSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4108*c215019aSStefano Zampini 
4109*c215019aSStefano Zampini     THRUSTARRAY *w = NULL;
4110*c215019aSStefano Zampini     thrust::device_ptr<PetscScalar> dv;
4111*c215019aSStefano Zampini     if (dmem) {
4112*c215019aSStefano Zampini       dv = thrust::device_pointer_cast(v);
4113*c215019aSStefano Zampini     } else {
4114*c215019aSStefano Zampini       w = new THRUSTARRAY(n);
4115*c215019aSStefano Zampini       dv = w->data();
4116*c215019aSStefano Zampini     }
4117*c215019aSStefano Zampini     thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);
4118*c215019aSStefano Zampini 
4119*c215019aSStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
4120*c215019aSStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
4121*c215019aSStefano Zampini     thrust::for_each(zibit,zieit,VecCUDAEquals());
4122*c215019aSStefano Zampini     if (w) {
4123*c215019aSStefano Zampini       cerr = cudaMemcpy(v,w->data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4124*c215019aSStefano Zampini     }
4125*c215019aSStefano Zampini     delete w;
4126*c215019aSStefano Zampini   } else {
4127*c215019aSStefano Zampini     cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4128*c215019aSStefano Zampini   }
4129*c215019aSStefano Zampini   if (!dmem) { ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
4130*c215019aSStefano Zampini   ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
4131*c215019aSStefano Zampini   PetscFunctionReturn(0);
4132*c215019aSStefano Zampini }
4133