xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a587d1398bd356a7db20a09283619c724e82d622)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
/* Names for MatCUSPARSEStorageFormat as passed to PetscOptionsEnum(): the value names,
   then the enum type name, the common value prefix, and a terminating 0. */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /*
    The tables below mirror enums copied from cusparse.h in CUDA-11.0. The names are listed in
    0-based integer value order because PetscOptionsEnum() is used to parse the matching
    command line options, so an entry's position in a table must equal its enum value.

    cusparseSpMVAlg_t:
        CUSPARSE_MV_ALG_DEFAULT = 0
        CUSPARSE_COOMV_ALG      = 1
        CUSPARSE_CSRMV_ALG1     = 2
        CUSPARSE_CSRMV_ALG2     = 3

    cusparseSpMMAlg_t (the deprecated names alias the new ones):
        CUSPARSE_MM_ALG_DEFAULT   = 0   deprecated, use CUSPARSE_SPMM_ALG_DEFAULT
        CUSPARSE_COOMM_ALG1       = 1   deprecated, use CUSPARSE_SPMM_COO_ALG1
        CUSPARSE_COOMM_ALG2       = 2   deprecated, use CUSPARSE_SPMM_COO_ALG2
        CUSPARSE_COOMM_ALG3       = 3   deprecated, use CUSPARSE_SPMM_COO_ALG3
        CUSPARSE_CSRMM_ALG1       = 4   deprecated, use CUSPARSE_SPMM_CSR_ALG1
        CUSPARSE_SPMM_ALG_DEFAULT = 0
        CUSPARSE_SPMM_COO_ALG1    = 1
        CUSPARSE_SPMM_COO_ALG2    = 2
        CUSPARSE_SPMM_COO_ALG3    = 3
        CUSPARSE_SPMM_CSR_ALG1    = 4
        CUSPARSE_SPMM_COO_ALG4    = 5
        CUSPARSE_SPMM_CSR_ALG2    = 6

    cusparseCsr2CscAlg_t:
        CUSPARSE_CSR2CSC_ALG1 = 1   faster than ALG2 (in general), deterministic
        CUSPARSE_CSR2CSC_ALG2 = 2   low memory requirement, non-deterministic
  */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT",
    "COOMV_ALG",
    "CSRMV_ALG1",
    "CSRMV_ALG2",
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    0
  };
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT",
    "COO_ALG1",
    "COO_ALG2",
    "COO_ALG3",
    "CSR_ALG1",
    "COO_ALG4",
    "CSR_ALG2",
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    0
  };
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID", /* cusparse does not have enum 0! We created one */
    "ALG1",
    "ALG2",
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    0
  };
#endif
539ae82921SPaul Mullowney 
/* Cholesky / ICC factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* LU / ILU factorization entry points */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Solves, options handling and matrix-vector style operations */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Teardown helpers; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded on its first argument type */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* COO assembly routines with PETSC_INTERN linkage */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
857e8381f9SStefano Zampini 
/*
   MatCUSPARSESetStream - Stores a CUDA stream in the matrix's CUSPARSE data and sets it on
   the cuSPARSE handle held there.

   Input Parameters:
+  A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
-  stream - the CUDA stream to record and hand to cusparseSetStream()

   Notes:
   Errors with PETSC_ERR_COR if A->spptr is NULL. No check of the matrix type is made here.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct;

  PetscFunctionBegin;
  /* validate the header before touching A; the spptr read is deferred until after this check */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  cusparsestruct->stream = stream;
  stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
97b06137fdSPaul Mullowney 
/*
   MatCUSPARSESetHandle - Installs a cuSPARSE handle in the matrix's CUSPARSE data.

   Input Parameters:
+  A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
-  handle - the cuSPARSE handle to store

   Notes:
   If a different, non-null handle is already stored it is destroyed with cusparseDestroy()
   before being replaced. In all cases the stored handle is then switched to
   CUSPARSE_POINTER_MODE_DEVICE. Errors with PETSC_ERR_COR if A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusparsestruct;

  PetscFunctionBegin;
  /* validate the header before touching A; the spptr read is deferred until after this check */
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparsestruct->handle != handle) {
    /* release the handle being replaced, if there is one */
    if (cusparsestruct->handle) {
      stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
    }
    cusparsestruct->handle = handle;
  }
  stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  PetscFunctionReturn(0);
}
114b06137fdSPaul Mullowney 
/*
   MatCUSPARSEClearHandle - Forgets the cuSPARSE handle stored in a MATSEQAIJCUSPARSE matrix.

   Input Parameter:
.  A - the matrix

   Notes:
   The stored handle is only set to 0; it is not destroyed here. Nothing is done when A is
   not of type MATSEQAIJCUSPARSE or when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && cusp) {
    /* a handle that is already 0 simply stays 0 */
    cusp->handle = 0;
  }
  PetscFunctionReturn(0);
}
127b06137fdSPaul Mullowney 
/*
   MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

   Input Parameter:
.  A - the matrix (not examined)

   Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  type[0] = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1349ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1479ae82921SPaul Mullowney 
/*
   MatGetFactor_seqaijcusparse_cusparse - Creates the MATSEQAIJCUSPARSE matrix that will hold a
   factorization of A and installs the symbolic factorization routines for the requested type.

   Input Parameters:
+  A     - the matrix to be factored; its local row count gives all four sizes of the factor
-  ftype - MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT, MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

   Output Parameter:
.  B - the new factor matrix

   Notes:
   Any other factor type raises PETSC_ERR_SUP. Block sizes are copied from A only for the
   LU-type factorizations.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  const PetscInt nloc = A->rmap->n;
  Mat            F;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B;
  ierr = MatSetSizes(F,nloc,nloc,nloc,nloc);CHKERRQ(ierr);
  F->factortype  = ftype;
  F->useordering = PETSC_TRUE;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
    F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1739ae82921SPaul Mullowney 
/*
   MatCUSPARSESetFormat_SeqAIJCUSPARSE - Records the requested storage format in the
   Mat_SeqAIJCUSPARSE data of A.

   Input Parameters:
+  A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
.  op     - MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL; both set the same single format field
-  format - the storage format to record

   Notes:
   Any other operation raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT: /* fall through: same field as MAT_CUSPARSE_ALL */
  case MAT_CUSPARSE_ALL:
    cusp->format = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
1919ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

   Notes:
   The request is forwarded to the "MatCUSPARSESetFormat_C" method of A through PetscTryMethod().

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch by name so the implementation attached to A's type handles the request */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
218e057df02SPaul Mullowney 
/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

   Notes:
   Matrices of any other type are left untouched. Calling this on a factored matrix is an error.
   Passing PETSC_FALSE also destroys the stored transpose structure.

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode     ierr;
  PetscBool          iscusparse;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse) PetscFunctionReturn(0);

  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp->transgen = transgen;
  if (!transgen) {
    /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
251e6e9a74fSStefano Zampini 
/*
   MatSetFromOptions_SeqAIJCUSPARSE - Processes the options-database entries of a SeqAIJCUSPARSE matrix.

   Input Parameters:
+  PetscOptionsObject - the options context (the name is required by the PetscOptionsXXX() macros)
-  A                  - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE

   Options handled (only when A is not a factored matrix):
+  -mat_cusparse_transgen            - forwarded to MatSeqAIJCUSPARSESetGenerateTranspose()
.  -mat_cusparse_mult_storage_format - forwarded to MatCUSPARSESetFormat() with MAT_CUSPARSE_MULT
.  -mat_cusparse_storage_format      - forwarded to MatCUSPARSESetFormat() with MAT_CUSPARSE_ALL
.  -mat_cusparse_spmv_alg            - (CUDA >= 11) stored in spmvAlg, default CUSPARSE_CSRMV_ALG1
.  -mat_cusparse_spmm_alg            - (CUDA >= 11) stored in spmmAlg, default CUSPARSE_SPMM_CSR_ALG1
-  -mat_cusparse_csr2csc_alg         - (CUDA >= 11) stored in csr2cscAlg, default CUSPARSE_CSR2CSC_ALG1

   Notes:
   The three CUDA >= 11 defaults are assigned on every call before the option is queried.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                isset;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool gentrans = cusp->transgen;

    /* explicit transpose generation */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",gentrans,&gentrans,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gentrans);CHKERRQ(ierr);
    }

    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* SpMV algorithm; the default is the CSR one since we only support csr */
    cusp->spmvAlg = CUSPARSE_CSRMV_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusp->spmvAlg,(PetscEnum*)&cusp->spmvAlg,&isset);CHKERRQ(ierr);
    /* PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[],
       so when the user did use the option, check that the table still agrees with cuSPARSE */
    if (isset && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    /* SpMM algorithm; the default only supports a column-major dense matrix B */
    cusp->spmmAlg = CUSPARSE_SPMM_CSR_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusp->spmmAlg,(PetscEnum*)&cusp->spmmAlg,&isset);CHKERRQ(ierr);
    if (isset && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* CSR to CSC conversion algorithm */
    cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusp->csr2cscAlg,(PetscEnum*)&cusp->csr2cscAlg,&isset);CHKERRQ(ierr);
    if (isset && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2959ae82921SPaul Mullowney 
/*
   MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for the CUSPARSE factor matrix.

   Resets the triangular-factor data referenced by B->spptr, runs MatILUFactorSymbolic_SeqAIJ()
   and installs MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

   Input Parameters:
+  B     - the factor matrix
.  A     - the matrix to factor
.  isrow - row index set, passed on to the SeqAIJ routine
.  iscol - column index set, passed on to the SeqAIJ routine
-  info  - factorization options, passed on to the SeqAIJ routine
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset is handed the address of the local copy of the pointer, not of B->spptr itself */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3079ae82921SPaul Mullowney 
/*
   MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for the CUSPARSE factor matrix.

   Resets the triangular-factor data referenced by B->spptr, runs MatLUFactorSymbolic_SeqAIJ()
   and installs MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

   Input Parameters:
+  B     - the factor matrix
.  A     - the matrix to factor
.  isrow - row index set, passed on to the SeqAIJ routine
.  iscol - column index set, passed on to the SeqAIJ routine
-  info  - factorization options, passed on to the SeqAIJ routine
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset is handed the address of the local copy of the pointer, not of B->spptr itself */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3199ae82921SPaul Mullowney 
/*
   MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for the CUSPARSE factor matrix.

   Resets the triangular-factor data referenced by B->spptr, runs MatICCFactorSymbolic_SeqAIJ()
   and installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

   Input Parameters:
+  B    - the factor matrix
.  A    - the matrix to factor
.  perm - permutation index set, passed on to the SeqAIJ routine
-  info - factorization options, passed on to the SeqAIJ routine
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset is handed the address of the local copy of the pointer, not of B->spptr itself */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
331087f3262SPaul Mullowney 
/*
   MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for the CUSPARSE factor matrix.

   Resets the triangular-factor data referenced by B->spptr, runs MatCholeskyFactorSymbolic_SeqAIJ()
   and installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric factorization routine of B.

   Input Parameters:
+  B    - the factor matrix
.  A    - the matrix to factor
.  perm - permutation index set, passed on to the SeqAIJ routine
-  info - factorization options, passed on to the SeqAIJ routine
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset is handed the address of the local copy of the pointer, not of B->spptr itself */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
343087f3262SPaul Mullowney 
344087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3459ae82921SPaul Mullowney {
3469ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3479ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3489ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
349aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3509ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3519ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3529ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3539ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3549ae82921SPaul Mullowney   PetscScalar                       *AALo;
3559ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
356b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
35757d48284SJunchao Zhang   cudaError_t                       cerr;
3589ae82921SPaul Mullowney 
3599ae82921SPaul Mullowney   PetscFunctionBegin;
360cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
361c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3629ae82921SPaul Mullowney     try {
3639ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3649ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
365da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
366da79fbbcSStefano Zampini       if (!loTriFactor) {
3679ae82921SPaul Mullowney 
3689ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
36957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3719ae82921SPaul Mullowney 
3729ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3739ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3749ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3759ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3769ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3779ae82921SPaul Mullowney         v        = aa;
3789ae82921SPaul Mullowney         vi       = aj;
3799ae82921SPaul Mullowney         offset   = 1;
3809ae82921SPaul Mullowney         rowOffset= 1;
3819ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3829ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
383e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3849ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3859ae82921SPaul Mullowney           rowOffset += nz+1;
3869ae82921SPaul Mullowney 
387580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
388580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3899ae82921SPaul Mullowney 
3909ae82921SPaul Mullowney           offset      += nz;
3919ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3929ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3939ae82921SPaul Mullowney           offset      += 1;
3949ae82921SPaul Mullowney 
3959ae82921SPaul Mullowney           v  += nz;
3969ae82921SPaul Mullowney           vi += nz;
3979ae82921SPaul Mullowney         }
3982205254eSKarl Rupp 
399aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
400da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
401da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
402aa372e3fSPaul Mullowney         /* Create the matrix description */
40357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
40457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4051b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
406afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
407afb2bd1cSJunchao Zhang        #else
40857d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
409afb2bd1cSJunchao Zhang        #endif
41057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
412aa372e3fSPaul Mullowney 
413aa372e3fSPaul Mullowney         /* set the operation */
414aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
415aa372e3fSPaul Mullowney 
416aa372e3fSPaul Mullowney         /* set the matrix */
417aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
418aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
419aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
420aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
421aa372e3fSPaul Mullowney 
422aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
423aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
424aa372e3fSPaul Mullowney 
425aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
427aa372e3fSPaul Mullowney 
428aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
430aa372e3fSPaul Mullowney 
431afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
432da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
433afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4341b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
435afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
436afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
437afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
438afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
439afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
440afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
441afb2bd1cSJunchao Zhang       #endif
442afb2bd1cSJunchao Zhang 
443aa372e3fSPaul Mullowney         /* perform the solve analysis */
444aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
445aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
446aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
447afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4481b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
449afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
450afb2bd1cSJunchao Zhang                                #endif
451afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
452da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
453da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
454aa372e3fSPaul Mullowney 
455da79fbbcSStefano Zampini         /* assign the pointer */
456aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4572205254eSKarl Rupp 
45857d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
45957d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4604863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
461da79fbbcSStefano Zampini       } else { /* update values only */
462da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
463da79fbbcSStefano Zampini         AALo[0]  = 1.0;
464da79fbbcSStefano Zampini         v        = aa;
465da79fbbcSStefano Zampini         vi       = aj;
466da79fbbcSStefano Zampini         offset   = 1;
467da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
468da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
469da79fbbcSStefano Zampini           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
470da79fbbcSStefano Zampini           offset      += nz;
471da79fbbcSStefano Zampini           AALo[offset] = 1.0;
472da79fbbcSStefano Zampini           offset      += 1;
473da79fbbcSStefano Zampini           v  += nz;
474da79fbbcSStefano Zampini         }
475da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
476da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
477da79fbbcSStefano Zampini       }
478da79fbbcSStefano Zampini       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
4799ae82921SPaul Mullowney     } catch(char *ex) {
4809ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4819ae82921SPaul Mullowney     }
4829ae82921SPaul Mullowney   }
4839ae82921SPaul Mullowney   PetscFunctionReturn(0);
4849ae82921SPaul Mullowney }
4859ae82921SPaul Mullowney 
486087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4879ae82921SPaul Mullowney {
4889ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4899ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4909ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
491aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
4929ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4939ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
4949ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
4959ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4969ae82921SPaul Mullowney   PetscScalar                       *AAUp;
4979ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
4989ae82921SPaul Mullowney   PetscErrorCode                    ierr;
49957d48284SJunchao Zhang   cudaError_t                       cerr;
5009ae82921SPaul Mullowney 
5019ae82921SPaul Mullowney   PetscFunctionBegin;
502cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
503c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5049ae82921SPaul Mullowney     try {
5059ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5069ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
507da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
508da79fbbcSStefano Zampini       if (!upTriFactor) {
5099ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
51057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
51157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
5129ae82921SPaul Mullowney 
5139ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5149ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5159ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5169ae82921SPaul Mullowney         offset = nzUpper;
5179ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5189ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5199ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5209ae82921SPaul Mullowney 
521e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5229ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5239ae82921SPaul Mullowney 
524e057df02SPaul Mullowney           /* decrement the offset */
5259ae82921SPaul Mullowney           offset -= (nz+1);
5269ae82921SPaul Mullowney 
527e057df02SPaul Mullowney           /* first, set the diagonal elements */
5289ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
52909f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5309ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5319ae82921SPaul Mullowney 
532580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
533580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5349ae82921SPaul Mullowney         }
5352205254eSKarl Rupp 
536aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
537da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
538da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5392205254eSKarl Rupp 
540aa372e3fSPaul Mullowney         /* Create the matrix description */
54157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
54257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
5431b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
544afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
545afb2bd1cSJunchao Zhang        #else
54657d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
547afb2bd1cSJunchao Zhang        #endif
54857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
54957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
550aa372e3fSPaul Mullowney 
551aa372e3fSPaul Mullowney         /* set the operation */
552aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
553aa372e3fSPaul Mullowney 
554aa372e3fSPaul Mullowney         /* set the matrix */
555aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
556aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
557aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
558aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
559aa372e3fSPaul Mullowney 
560aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
561aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
562aa372e3fSPaul Mullowney 
563aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
564aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
565aa372e3fSPaul Mullowney 
566aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
567aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
568aa372e3fSPaul Mullowney 
569afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
570da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
571afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
5721b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
573afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
574afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
575afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
576afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
577afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
578afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
579afb2bd1cSJunchao Zhang       #endif
580afb2bd1cSJunchao Zhang 
581aa372e3fSPaul Mullowney         /* perform the solve analysis */
582aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
583aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
584aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
585afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
5861b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
587afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
588afb2bd1cSJunchao Zhang                                #endif
589afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
590da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
591da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
592aa372e3fSPaul Mullowney 
593da79fbbcSStefano Zampini         /* assign the pointer */
594aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
5952205254eSKarl Rupp 
59657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
59757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
5984863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
599da79fbbcSStefano Zampini       } else {
600da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
601da79fbbcSStefano Zampini         offset = nzUpper;
602da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
603da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
604da79fbbcSStefano Zampini 
605da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
606da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
607da79fbbcSStefano Zampini 
608da79fbbcSStefano Zampini           /* decrement the offset */
609da79fbbcSStefano Zampini           offset -= (nz+1);
610da79fbbcSStefano Zampini 
611da79fbbcSStefano Zampini           /* first, set the diagonal elements */
612da79fbbcSStefano Zampini           AAUp[offset] = 1./v[nz];
613da79fbbcSStefano Zampini           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
614da79fbbcSStefano Zampini         }
615da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
616da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
617da79fbbcSStefano Zampini       }
618da79fbbcSStefano Zampini       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
6199ae82921SPaul Mullowney     } catch(char *ex) {
6209ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6219ae82921SPaul Mullowney     }
6229ae82921SPaul Mullowney   }
6239ae82921SPaul Mullowney   PetscFunctionReturn(0);
6249ae82921SPaul Mullowney }
6259ae82921SPaul Mullowney 
626087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6279ae82921SPaul Mullowney {
6289ae82921SPaul Mullowney   PetscErrorCode               ierr;
6299ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6309ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6319ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6329ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6339ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6349ae82921SPaul Mullowney 
6359ae82921SPaul Mullowney   PetscFunctionBegin;
636da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
637087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
638087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
6392205254eSKarl Rupp 
640da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
641aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6429ae82921SPaul Mullowney 
643c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
644e057df02SPaul Mullowney   /* lower triangular indices */
6459ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
646da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
647da79fbbcSStefano Zampini     const PetscInt *r;
648da79fbbcSStefano Zampini 
649da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
650aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
651aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6529ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
653da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
654da79fbbcSStefano Zampini   }
6559ae82921SPaul Mullowney 
656e057df02SPaul Mullowney   /* upper triangular indices */
6579ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
658da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
659da79fbbcSStefano Zampini     const PetscInt *c;
660da79fbbcSStefano Zampini 
661da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
662aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
663aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6649ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
665da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
666da79fbbcSStefano Zampini   }
6679ae82921SPaul Mullowney   PetscFunctionReturn(0);
6689ae82921SPaul Mullowney }
6699ae82921SPaul Mullowney 
670087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
671087f3262SPaul Mullowney {
672087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
673087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
674aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
675aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
676087f3262SPaul Mullowney   cusparseStatus_t                  stat;
677087f3262SPaul Mullowney   PetscErrorCode                    ierr;
67857d48284SJunchao Zhang   cudaError_t                       cerr;
679087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
680087f3262SPaul Mullowney   PetscScalar                       *AAUp;
681087f3262SPaul Mullowney   PetscScalar                       *AALo;
682087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
683087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
684087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
685087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
686087f3262SPaul Mullowney 
687087f3262SPaul Mullowney   PetscFunctionBegin;
688cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
689c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
690087f3262SPaul Mullowney     try {
691da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
692da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
693da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
694087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
69557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
69657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
697087f3262SPaul Mullowney 
698087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
699087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
700087f3262SPaul Mullowney         AiUp[n]=nzUpper;
701087f3262SPaul Mullowney         offset = 0;
702087f3262SPaul Mullowney         for (i=0; i<n; i++) {
703087f3262SPaul Mullowney           /* set the pointers */
704087f3262SPaul Mullowney           v  = aa + ai[i];
705087f3262SPaul Mullowney           vj = aj + ai[i];
706087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
707087f3262SPaul Mullowney 
708087f3262SPaul Mullowney           /* first, set the diagonal elements */
709087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
71009f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
711087f3262SPaul Mullowney           AiUp[i]      = offset;
71209f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
713087f3262SPaul Mullowney 
714087f3262SPaul Mullowney           offset+=1;
715087f3262SPaul Mullowney           if (nz>0) {
716f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
717580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
718087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
719087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
720087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
721087f3262SPaul Mullowney             }
722087f3262SPaul Mullowney             offset+=nz;
723087f3262SPaul Mullowney           }
724087f3262SPaul Mullowney         }
725087f3262SPaul Mullowney 
726aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
727da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
728da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
729087f3262SPaul Mullowney 
730aa372e3fSPaul Mullowney         /* Create the matrix description */
73157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
73257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7331b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
734afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
735afb2bd1cSJunchao Zhang        #else
73657d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
737afb2bd1cSJunchao Zhang        #endif
73857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
73957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
740087f3262SPaul Mullowney 
741aa372e3fSPaul Mullowney         /* set the matrix */
742aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
743aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
744aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
745aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
746aa372e3fSPaul Mullowney 
747aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
748aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
749aa372e3fSPaul Mullowney 
750aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
751aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
752aa372e3fSPaul Mullowney 
753aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
754aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
755aa372e3fSPaul Mullowney 
756afb2bd1cSJunchao Zhang         /* set the operation */
757afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
758afb2bd1cSJunchao Zhang 
759afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
760da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
761afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7621b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
764afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
765afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
766afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
767afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
768afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
769afb2bd1cSJunchao Zhang       #endif
770afb2bd1cSJunchao Zhang 
771aa372e3fSPaul Mullowney         /* perform the solve analysis */
772aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
773aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
774aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
775afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7761b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
777afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
778afb2bd1cSJunchao Zhang                                 #endif
779afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
780da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
781da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
782aa372e3fSPaul Mullowney 
783da79fbbcSStefano Zampini         /* assign the pointer */
784aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
785aa372e3fSPaul Mullowney 
786aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
787da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
788da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
789aa372e3fSPaul Mullowney 
790aa372e3fSPaul Mullowney         /* Create the matrix description */
79157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
79257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7931b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
794afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
795afb2bd1cSJunchao Zhang        #else
79657d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
797afb2bd1cSJunchao Zhang        #endif
79857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
79957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
800aa372e3fSPaul Mullowney 
801aa372e3fSPaul Mullowney         /* set the operation */
802aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
803aa372e3fSPaul Mullowney 
804aa372e3fSPaul Mullowney         /* set the matrix */
805aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
806aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
807aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
808aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
809aa372e3fSPaul Mullowney 
810aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
811aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
812aa372e3fSPaul Mullowney 
813aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
814aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
815aa372e3fSPaul Mullowney 
816aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
817aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
818aa372e3fSPaul Mullowney 
819afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
820da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
821afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8221b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
824afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
825afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
826afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
827afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
828afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
829afb2bd1cSJunchao Zhang       #endif
830afb2bd1cSJunchao Zhang 
831aa372e3fSPaul Mullowney         /* perform the solve analysis */
832aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
833aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
834aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
835afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8361b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
837afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
838afb2bd1cSJunchao Zhang                                 #endif
839afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
840da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
841da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
842aa372e3fSPaul Mullowney 
843da79fbbcSStefano Zampini         /* assign the pointer */
844aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
845087f3262SPaul Mullowney 
846da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
84757d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
84857d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
849da79fbbcSStefano Zampini       } else {
850da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
851da79fbbcSStefano Zampini         offset = 0;
852da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
853da79fbbcSStefano Zampini           /* set the pointers */
854da79fbbcSStefano Zampini           v  = aa + ai[i];
855da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
856da79fbbcSStefano Zampini 
857da79fbbcSStefano Zampini           /* first, set the diagonal elements */
858da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
859da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
860da79fbbcSStefano Zampini 
861da79fbbcSStefano Zampini           offset+=1;
862da79fbbcSStefano Zampini           if (nz>0) {
863da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
864da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
865da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
866da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
867da79fbbcSStefano Zampini             }
868da79fbbcSStefano Zampini             offset+=nz;
869da79fbbcSStefano Zampini           }
870da79fbbcSStefano Zampini         }
871da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
872da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
873da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
874da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
875da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
876da79fbbcSStefano Zampini       }
87757d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
87857d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
879087f3262SPaul Mullowney     } catch(char *ex) {
880087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
881087f3262SPaul Mullowney     }
882087f3262SPaul Mullowney   }
883087f3262SPaul Mullowney   PetscFunctionReturn(0);
884087f3262SPaul Mullowney }
885087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors for a factored
  matrix via MatSeqAIJCUSPARSEBuildICCTriMatrices() and, when the ordering stored in a->row is
  not the identity, copies the permutation and its inverse into the cusparseTriFactors
  rpermIndices/cpermIndices arrays.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Sets A->offloadmask to PETSC_OFFLOAD_BOTH.

  workVector, rpermIndices and cpermIndices are allocated only when they do not exist yet, so
  calling this routine again for the same factor object reuses the existing arrays instead of
  leaking them; the contents of the permutation arrays are refreshed on every call.

  Returns 0 on success, or a PETSc error code (PETSC_ERR_COR when A->spptr is NULL).
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* off-diagonal entries are counted twice (once per triangular factor), the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices, only needed for a non-identity ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* allocate lazily, like workVector above, so a repeated call does not leak the old arrays */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
923087f3262SPaul Mullowney 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization for SEQAIJCUSPARSE.

  Runs the host factorization MatLUFactorNumeric_SeqAIJ(), marks the factor data as residing on
  the CPU, selects the solve routines according to whether both the row and column orderings are
  the identity, and finally calls MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() on the factor.

  Input Parameters:
+ B    - the factor matrix (filled by this call)
. A    - the matrix to factor
- info - factorization options, forwarded unchanged to MatLUFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *fact   = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row,colperm = fact->col;
  PetscBool      rowIsIdentity,colIsIdentity;

  PetscFunctionBegin;
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural ordering gets the permutation-free solve kernels */
  ierr = ISIdentity(rowperm,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity || !colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* no multi-RHS solve is installed for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors used by the solves */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9539ae82921SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SEQAIJCUSPARSE.

  Runs the host factorization MatCholeskyFactorNumeric_SeqAIJ(), marks the factor data as residing
  on the CPU, selects the solve routines according to whether the ordering stored in the factor is
  the identity, and finally calls MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() on the factor.

  Input Parameters:
+ B    - the factor matrix (filled by this call)
. A    - the matrix to factor
- info - factorization options, forwarded unchanged to MatCholeskyFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm  = fact->row;
  PetscBool      permIsIdentity;

  PetscFunctionBegin;
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* natural ordering gets the permutation-free solve kernels */
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* no multi-RHS solve is installed for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors used by the solves */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9829ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private - Builds the explicit transpose of one
  triangular factor and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used only as the object for event logging
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the triangular factor to transpose (descriptor and CSR data are read)

  Output Parameter:
. triFactorTOut - newly allocated factor holding the transposed CSR data, its descriptor
                  (same type, index base and diagonal type as the input, opposite fill mode),
                  and the solve analysis information

  Notes:
  With CUDA >= 11 the csr2csc work buffer is allocated here and stored on the input factor
  (triFactor->csr2cscBuffer); it is not released by this routine.
  NOTE(review): an already existing csr2cscBuffer is overwritten without being freed - confirm
  that callers only invoke this once per factor.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the original factor; transposing swaps upper <-> lower fill mode */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase = cusparseGetMatIndexBase(triFactor->descr);
  fillMode = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so it is solved with a non-transpose operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
                        );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (a second Begin here would leave it unbalanced) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,triFactorT->solvePolicy, triFactorT->solveBuffer
                          #endif
                          );CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds the explicit transposes of the lower and
  upper triangular factors stored in A->spptr, runs the cuSPARSE solve analysis on each, and
  stores the results in loTriFactorPtrTranspose and upTriFactorPtrTranspose.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are already set

  Returns 0 on success, or a PETSc error code (PETSC_ERR_COR when the factor data is missing).
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* validate everything up front so that nothing is allocated when the input is incomplete */
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1184bda325fcSPaul Mullowney 
1185b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1186bda325fcSPaul Mullowney {
1187aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1188aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1189aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1190bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1191bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1192aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1193b06137fdSPaul Mullowney   cudaError_t                  err;
119485ba7357SStefano Zampini   PetscErrorCode               ierr;
1195b175d8bbSPaul Mullowney 
1196bda325fcSPaul Mullowney   PetscFunctionBegin;
119785ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
119885ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
119985ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
120085ba7357SStefano Zampini   /* create cusparse matrix */
1201aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
120257d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1203aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
120457d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
120557d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1206aa372e3fSPaul Mullowney 
1207b06137fdSPaul Mullowney   /* set alpha and beta */
1208afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12097656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12107656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1211afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12127656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12137656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
121457d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1215b06137fdSPaul Mullowney 
1216aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1217aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1218aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1219554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1220554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1221aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1222a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1223aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1224aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1225a3fdcf43SKarl Rupp 
122681902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
122781902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1228afb2bd1cSJunchao Zhang 
122981902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1230afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1231afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1232afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1233afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1234afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1235afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1236afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1237afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1238afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1239afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1240afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1241afb2bd1cSJunchao Zhang    #endif
1242afb2bd1cSJunchao Zhang 
1243a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1244a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1245aa372e3fSPaul Mullowney                             matrix->values->data().get(),
124681902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1247aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1248aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1249afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1250afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1251afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1252afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1253afb2bd1cSJunchao Zhang                           #else
1254afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1255afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1256afb2bd1cSJunchao Zhang                           #endif
1257afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
1258aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1259afb2bd1cSJunchao Zhang 
1260afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1261afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1262afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1263afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1264afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1265afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1266afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1267afb2bd1cSJunchao Zhang    #endif
1268aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1269afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1270afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1271afb2bd1cSJunchao Zhang    #else
1272aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
127351c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
127451c6d536SStefano Zampini     /* First convert HYB to CSR */
1275aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1276aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1277aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1278aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1279aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1280aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1281aa372e3fSPaul Mullowney 
1282aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1283aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1284aa372e3fSPaul Mullowney                             temp->values->data().get(),
1285aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
128657d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1287aa372e3fSPaul Mullowney 
1288aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1289aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1290aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1291aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1292aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1293aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1294aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1295aa372e3fSPaul Mullowney 
1296aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1297aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1298aa372e3fSPaul Mullowney                             temp->values->data().get(),
1299aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1300aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1301aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1302aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1303aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
130457d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1305aa372e3fSPaul Mullowney 
1306aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1307aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
130857d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1309aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1310aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1311aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1312aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1313aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1314aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
131557d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1316aa372e3fSPaul Mullowney 
1317aa372e3fSPaul Mullowney     /* assign the pointer */
1318aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1319aa372e3fSPaul Mullowney     /* delete temporaries */
1320aa372e3fSPaul Mullowney     if (tempT) {
1321aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1322aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1323aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1324aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1325087f3262SPaul Mullowney     }
1326aa372e3fSPaul Mullowney     if (temp) {
1327aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1328aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1329aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1330aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1331aa372e3fSPaul Mullowney     }
1332afb2bd1cSJunchao Zhang    #endif
1333aa372e3fSPaul Mullowney   }
133405035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
133585ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
133685ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1337213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1338213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1339aa372e3fSPaul Mullowney   /* assign the pointer */
1340aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1341bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1342bda325fcSPaul Mullowney }
1343bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - solve with the transposed triangular factors kept in A->spptr,
  applying the row permutation before and the column permutation after the two triangular solves.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed write-only through VecCUDAGetArrayWrite()

  Notes:
  Device-side sequence: gather bb through rpermIndices into xx; solve with the transposed upper factor
  (xx -> work vector); solve with the transposed lower factor (work vector -> xx); gather xx through
  cpermIndices into the work vector; copy the work vector back into xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscInt                          nloc     = xx->map->n;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* The transposed factors are created on demand: run the analysis only when neither one exists yet */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Raw device arrays of the two vectors, plus thrust handles wrapping them */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: xx <- bb gathered through the row permutation */
  /* NOTE(review): the end iterator offsets the element pointer by nloc; presumably only the index
     iterator bounds the range (MatSolve_SeqAIJCUSPARSE passes the unshifted pointer) - confirm */
  thrust::copy(thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsGPU+nloc, factors->rpermIndices->end()),
               solGPU);

  /* Step 2: transposed upper factor, reading xx and writing the work vector */
  cstat = cusparse_solve(factors->handle, upperT->solveOp,
                         upperT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         upperT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upperT->descr,
                         upperT->csrMat->values->data().get(),
                         upperT->csrMat->row_offsets->data().get(),
                         upperT->csrMat->column_indices->data().get(),
                         upperT->solveInfo,
                         sol, work->data().get()
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,upperT->solvePolicy, upperT->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Step 3: transposed lower factor, reading the work vector and writing xx */
  cstat = cusparse_solve(factors->handle, lowerT->solveOp,
                         lowerT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         lowerT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lowerT->descr,
                         lowerT->csrMat->values->data().get(),
                         lowerT->csrMat->row_offsets->data().get(),
                         lowerT->csrMat->column_indices->data().get(),
                         lowerT->solveInfo,
                         work->data().get(), sol
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,lowerT->solvePolicy, lowerT->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Step 4: the column permutation cannot be applied in place, so gather xx into the work vector ... */
  thrust::copy(thrust::make_permutation_iterator(solGPU, factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(solGPU+nloc, factors->cpermIndices->end()),
               work->begin());

  /* ... and then move the permuted result back into xx */
  thrust::copy(work->begin(), work->end(), solGPU);

  /* Hand the arrays back, wait for the device, then close the timer and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1430bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - solve with the transposed triangular factors kept
  in A->spptr, without applying any permutation to the vectors.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed write-only through VecCUDAGetArrayWrite()

  Notes:
  The transposed upper factor is applied first (bb -> work vector), then the transposed lower factor
  (work vector -> xx).
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* The transposed factors are created on demand: run the analysis only when neither one exists yet */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Raw device arrays of the two vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Transposed upper factor: reads bb, writes the work vector */
  cstat = cusparse_solve(factors->handle, upperT->solveOp,
                         upperT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         upperT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upperT->descr,
                         upperT->csrMat->values->data().get(),
                         upperT->csrMat->row_offsets->data().get(),
                         upperT->csrMat->column_indices->data().get(),
                         upperT->solveInfo,
                         rhs, work->data().get()
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,upperT->solvePolicy, upperT->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Transposed lower factor: reads the work vector, writes xx */
  cstat = cusparse_solve(factors->handle, lowerT->solveOp,
                         lowerT->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         lowerT->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lowerT->descr,
                         lowerT->csrMat->values->data().get(),
                         lowerT->csrMat->row_offsets->data().get(),
                         lowerT->csrMat->column_indices->data().get(),
                         lowerT->solveInfo,
                         work->data().get(), sol
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,lowerT->solvePolicy, lowerT->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Hand the arrays back, wait for the device, then close the timer and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1498bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - solve with the triangular factors kept in A->spptr, applying the row
  permutation before and the column permutation after the two triangular solves.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed write-only through VecCUDAGetArrayWrite()

  Notes:
  Device-side sequence: gather bb through rpermIndices into the work vector; solve with the lower factor
  (work vector -> xx); solve with the upper factor (xx -> work vector); gather the work vector through
  cpermIndices into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* Raw device arrays of the two vectors, plus thrust handles wrapping them */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: work vector <- bb gathered through the row permutation */
  thrust::copy(thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: lower factor, reading the work vector and writing xx */
  cstat = cusparse_solve(factors->handle, lower->solveOp,
                         lower->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         lower->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lower->descr,
                         lower->csrMat->values->data().get(),
                         lower->csrMat->row_offsets->data().get(),
                         lower->csrMat->column_indices->data().get(),
                         lower->solveInfo,
                         work->data().get(), sol
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,lower->solvePolicy, lower->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Step 3: upper factor, reading xx and writing the work vector */
  cstat = cusparse_solve(factors->handle, upper->solveOp,
                         upper->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         upper->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upper->descr,
                         upper->csrMat->values->data().get(),
                         upper->csrMat->row_offsets->data().get(),
                         upper->csrMat->column_indices->data().get(),
                         upper->solveInfo,
                         sol, work->data().get()
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,upper->solvePolicy, upper->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Step 4: xx <- work vector gathered through the column permutation */
  thrust::copy(thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solGPU);

  /* Hand the arrays back, wait for the device, then close the timer and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
15739ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve with the triangular factors kept in A->spptr, without
  applying any permutation to the vectors.

  Input Parameters:
+ A  - matrix whose spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side, accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - solution, accessed write-only through VecCUDAGetArrayWrite()

  Notes:
  The lower factor is applied first (bb -> work vector), then the upper factor (work vector -> xx).
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* Raw device arrays of the two vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Lower factor: reads bb, writes the work vector */
  cstat = cusparse_solve(factors->handle, lower->solveOp,
                         lower->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         lower->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lower->descr,
                         lower->csrMat->values->data().get(),
                         lower->csrMat->row_offsets->data().get(),
                         lower->csrMat->column_indices->data().get(),
                         lower->solveInfo,
                         rhs, work->data().get()
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,lower->solvePolicy, lower->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Upper factor: reads the work vector, writes xx */
  cstat = cusparse_solve(factors->handle, upper->solveOp,
                         upper->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         upper->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upper->descr,
                         upper->csrMat->values->data().get(),
                         upper->csrMat->row_offsets->data().get(),
                         upper->csrMat->column_indices->data().get(),
                         upper->solveInfo,
                         work->data().get(), sol
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,upper->solvePolicy, upper->solveBuffer
                       #endif
                         );CHKERRCUSPARSE(cstat);

  /* Hand the arrays back, wait for the device, then close the timer and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16339ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Refreshes the host copy of the nonzero values of A.

  The routine acts only when A->offloadmask is PETSC_OFFLOAD_GPU. In that case a->nz scalars are
  copied from the device values array of cusp->mat->mat (accessed as a CsrMatrix) into a->a, the
  transfer is logged, and the mask is set to PETSC_OFFLOAD_BOTH. For any other mask it is a no-op.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  size_t             nbytes;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* the host values are only out of date when the device holds the sole valid copy */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  csr    = (CsrMatrix*)gpu->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a,csr->values->data().get(),nbytes,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);

  /* host and device now agree */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
16547e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Hands out the host array of nonzero values of A.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a) of the underlying Mat_SeqAIJ

  The host values are first synchronized with MatSeqAIJCUSPARSECopyFromGPU(). The offload mask is
  then set to PETSC_OFFLOAD_CPU, so the device copy is treated as out of date from here on.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* must come first: it may itself update A->offloadmask */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  PetscFunctionReturn(0);
}
16667e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Mirrors the host AIJ data of A on the device.

  Nothing is done when A is bound to the CPU, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU. Otherwise one of two paths is taken:

   - values only: A was assembled before, its nonzero state equals the one recorded in the device
     structure, and the storage format is MAT_CUSPARSE_CSR. The values are copied into the existing
     device CSR matrix and, if a transpose was built before, its values are regenerated with csr2csc.

   - full rebuild: the device mult structures, the work vector and the cached row offsets are
     destroyed, and a new Mat_SeqAIJCUSPARSEMultStruct is built from the host data (from the
     compressed-row description when a->compressedrow.use is set). The nonzero state of A is
     recorded afterwards.

  In both cases the offload mask ends up as PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        /* matrixT is kept as the CSR form of the transpose: its row offsets/column indices receive
           the CSC column pointers/row indices (the two API generations order these arguments differently) */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                            A->cmap->n, matrix->num_entries,
                            matrix->values->data().get(),
                            cusparsestruct->rowoffsets_gpu->data().get(),
                            matrix->column_indices->data().get(),
                            matrixT->values->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                            CUSPARSE_ACTION_NUMERIC,indexBase,
                            cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
                          #else
                            matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                            CUSPARSE_ACTION_NUMERIC, indexBase
                          #endif
                           );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      /* Reset the pointers right away: rowoffsets_gpu is not reassigned in this routine and
         workVector only at the very end of the try block, so without this a later rebuild (or an
         error return in between) would leave dangling pointers that get deleted a second time */
      cusparsestruct->workVector     = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary CSR copy on the device, converted to HYB/ELL and released afterwards */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* logged volume: row offsets + column indices (int), compressed row indices (PetscInt),
           the three scalar constants and the values (PetscScalar) */
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember the pattern the device structures were built for (tested by the values-only path) */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18389ae82921SPaul Mullowney 
/* Functor on a tuple of references: adds element 0 of the tuple to element 1 and stores the sum in element 1 */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<1>(zipped) + thrust::get<0>(zipped);
  }
};
1848aa372e3fSPaul Mullowney 
/* Functor on a tuple of references: copies element 0 of the tuple into element 1 */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
18587e8381f9SStefano Zampini 
/* Functor on a tuple of references: copies element 1 of the tuple into element 0 (the reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple zipped)
  {
    thrust::get<0>(zipped) = thrust::get<1>(zipped);
  }
};
1868e6e9a74fSStefano Zampini 
/* Per-product data of a sparse (SeqAIJCUSPARSE) times dense product; released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool            cisdense;       /* if true, the numeric phase converts C to MATSEQDENSE when done */
  PetscScalar          *Bt;            /* device buffer for the explicitly transposed B (pre CUDA-11 code path) */
  Mat                  X;              /* intermediate dense matrix used by the PtAP and RARt products */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized;    /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;      /* dense descriptor wrapping B */
  cusparseDnMatDescr_t matCDescr;      /* dense descriptor wrapping C or X */
  size_t               spmmBufferSize; /* size in bytes reported by cusparseSpMM_bufferSize() */
  void                 *spmmBuffer;    /* device work buffer handed to cusparseSpMM() */
  PetscInt             Blda,Clda;      /* Record leading dimensions of B and C here to detect changes */
#endif
};
1882ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything owned by a MatMatCusparse object.

  Input Parameter:
. data - pointer to the MatMatCusparse structure; it is freed with PetscFree() at the end

  The device buffer Bt, the dense descriptors and SpMM buffer (CUDA >= 11 only) and the
  intermediate matrix X are released, in that order, before the structure itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spmmBuffer) {
    cerr = cudaFree(mm->spmmBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1901ccdfe979SStefano Zampini 
1902ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1903ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of a product of a SeqAIJCUSPARSE
  matrix A (C->product->A) with a dense matrix B (C->product->B).

  Input Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse object of the symbolic phase

  Supported product types:
   - AB, ABt, AtB: the sparse-dense product is written directly into C
   - PtAP, RARt:   the sparse-dense product (A*B, resp. A*B^T) is written into mmdata->X, and C is then
                   obtained with MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private()

  Notes:
  Errors out if A is not MATSEQAIJCUSPARSE, if A is bound to the CPU, or if no mult structure is
  available. If B is not MATSEQDENSECUDA it is converted in place for the duration of the call and
  converted back to MATSEQDENSE before returning. If mmdata->cisdense is set, C is converted to
  MATSEQDENSE at the end.

  With CUDA >= 11 the dense descriptors and the SpMM work buffer are cached in mmdata. They are
  (re)built when not yet initialized or when a leading dimension changed; every descriptor that is
  reused gets its data pointer refreshed before cusparseSpMM() is called.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the sparse operand, the operation applied to it, and the m x n size of the sparse-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      /* no explicit transpose requested: let cuSPARSE apply the transpose operation */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* PtAP and RARt write the sparse-dense product into the intermediate X, all other types into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    } else { /* descriptor is kept (only Clda changed): it must still point to the current array */
      stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    } else { /* descriptor is kept (only Blda changed): it must still point to the current array */
      stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    } else { /* existing sparse descriptor: refresh its values pointer as done in the reuse path below */
      stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (leading dimension B->cmap->n) with a geam call */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish the triple products with a dense-dense multiply involving the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  /* undo the temporary in-place conversion of a CPU dense B */
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2073ccdfe979SStefano Zampini 
2074ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2075ccdfe979SStefano Zampini {
2076ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2077ccdfe979SStefano Zampini   Mat                A,B;
2078ccdfe979SStefano Zampini   PetscInt           m,n;
2079ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2080ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2081ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2082ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2083ccdfe979SStefano Zampini 
2084ccdfe979SStefano Zampini   PetscFunctionBegin;
2085ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2086ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2087ccdfe979SStefano Zampini   A    = product->A;
2088ccdfe979SStefano Zampini   B    = product->B;
2089ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2090ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2091ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2092ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2093ccdfe979SStefano Zampini   switch (product->type) {
2094ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2095ccdfe979SStefano Zampini     m = A->rmap->n;
2096ccdfe979SStefano Zampini     n = B->cmap->n;
2097ccdfe979SStefano Zampini     break;
2098ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2099ccdfe979SStefano Zampini     m = A->cmap->n;
2100ccdfe979SStefano Zampini     n = B->cmap->n;
2101ccdfe979SStefano Zampini     break;
2102ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2103ccdfe979SStefano Zampini     m = A->rmap->n;
2104ccdfe979SStefano Zampini     n = B->rmap->n;
2105ccdfe979SStefano Zampini     break;
2106ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2107ccdfe979SStefano Zampini     m = B->cmap->n;
2108ccdfe979SStefano Zampini     n = B->cmap->n;
2109ccdfe979SStefano Zampini     break;
2110ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2111ccdfe979SStefano Zampini     m = B->rmap->n;
2112ccdfe979SStefano Zampini     n = B->rmap->n;
2113ccdfe979SStefano Zampini     break;
2114ccdfe979SStefano Zampini   default:
2115ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2116ccdfe979SStefano Zampini   }
2117ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2118ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2119ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2120ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2121ccdfe979SStefano Zampini 
2122ccdfe979SStefano Zampini   /* product data */
2123ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2124ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2125afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2126afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2127ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2128afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2129ccdfe979SStefano Zampini   }
2130afb2bd1cSJunchao Zhang  #endif
2131ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2132ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2133ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2134ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2135ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2136ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2137ccdfe979SStefano Zampini     } else {
2138ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2139ccdfe979SStefano Zampini     }
2140ccdfe979SStefano Zampini   }
2141ccdfe979SStefano Zampini   C->product->data    = mmdata;
2142ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2143ccdfe979SStefano Zampini 
2144ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2145ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2146ccdfe979SStefano Zampini }
2147ccdfe979SStefano Zampini 
2148ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2149ccdfe979SStefano Zampini 
2150ccdfe979SStefano Zampini /* handles dense B */
2151ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2152ccdfe979SStefano Zampini {
2153ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
2154ccdfe979SStefano Zampini   PetscErrorCode ierr;
2155ccdfe979SStefano Zampini 
2156ccdfe979SStefano Zampini   PetscFunctionBegin;
2157ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2158ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2159ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
2160ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2161ccdfe979SStefano Zampini     PetscFunctionReturn(0);
2162ccdfe979SStefano Zampini   }
2163ccdfe979SStefano Zampini   switch (product->type) {
2164ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2165ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2166ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2167ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2168ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2169ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2170ccdfe979SStefano Zampini   default:
2171ccdfe979SStefano Zampini     break;
2172ccdfe979SStefano Zampini   }
2173ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2174ccdfe979SStefano Zampini }
2175ccdfe979SStefano Zampini 
21766fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
21779ae82921SPaul Mullowney {
2178b175d8bbSPaul Mullowney   PetscErrorCode ierr;
21799ae82921SPaul Mullowney 
21809ae82921SPaul Mullowney   PetscFunctionBegin;
2181e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2182e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2183e6e9a74fSStefano Zampini }
2184e6e9a74fSStefano Zampini 
2185e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2186e6e9a74fSStefano Zampini {
2187e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2188e6e9a74fSStefano Zampini 
2189e6e9a74fSStefano Zampini   PetscFunctionBegin;
2190e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2191e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2192e6e9a74fSStefano Zampini }
2193e6e9a74fSStefano Zampini 
2194e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2195e6e9a74fSStefano Zampini {
2196e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2197e6e9a74fSStefano Zampini 
2198e6e9a74fSStefano Zampini   PetscFunctionBegin;
2199e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2200e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2201e6e9a74fSStefano Zampini }
2202e6e9a74fSStefano Zampini 
2203e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2204e6e9a74fSStefano Zampini {
2205e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2206e6e9a74fSStefano Zampini 
2207e6e9a74fSStefano Zampini   PetscFunctionBegin;
2208e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
22099ae82921SPaul Mullowney   PetscFunctionReturn(0);
22109ae82921SPaul Mullowney }
22119ae82921SPaul Mullowney 
22126fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2213ca45077fSPaul Mullowney {
2214b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2215ca45077fSPaul Mullowney 
2216ca45077fSPaul Mullowney   PetscFunctionBegin;
2217e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2218ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2219ca45077fSPaul Mullowney }
2220ca45077fSPaul Mullowney 
2221afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2222e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
22239ae82921SPaul Mullowney {
22249ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2225aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
22269ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2227e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2228b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
222957d48284SJunchao Zhang   cudaError_t                  cerr;
2230aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2231e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2232e6e9a74fSStefano Zampini   PetscBool                    compressed;
2233afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2234afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2235afb2bd1cSJunchao Zhang #endif
22366e111a19SKarl Rupp 
22379ae82921SPaul Mullowney   PetscFunctionBegin;
2238e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2239e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2240afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2241d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2242e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2243e6e9a74fSStefano Zampini   }
224434d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
224534d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2246e6e9a74fSStefano Zampini   if (!trans) {
22479ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2248c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2249e6e9a74fSStefano Zampini   } else {
2250e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2251e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2252e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2253e6e9a74fSStefano Zampini     } else {
2254afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2255e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2256e6e9a74fSStefano Zampini     }
2257e6e9a74fSStefano Zampini   }
2258e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2259e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2260213423ffSJunchao Zhang 
2261e6e9a74fSStefano Zampini   try {
2262e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2263213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2264213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2265afb2bd1cSJunchao Zhang 
226685ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2267e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2268afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2269afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2270afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2271afb2bd1cSJunchao Zhang       */
2272e6e9a74fSStefano Zampini       xptr = xarray;
2273afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2274213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2275afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2276afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2277afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2278afb2bd1cSJunchao Zhang        */
2279afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2280afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2281afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2282afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2283afb2bd1cSJunchao Zhang       }
2284afb2bd1cSJunchao Zhang      #endif
2285e6e9a74fSStefano Zampini     } else {
2286afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2287afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2288afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2289afb2bd1cSJunchao Zhang        */
2290afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2291e6e9a74fSStefano Zampini       dptr = zarray;
2292e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2293afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2294e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2295e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2296e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2297e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2298e6e9a74fSStefano Zampini       }
2299afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2300afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2301afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2302afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2303afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2304afb2bd1cSJunchao Zhang       }
2305afb2bd1cSJunchao Zhang      #endif
2306e6e9a74fSStefano Zampini     }
23079ae82921SPaul Mullowney 
2308afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2309aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2310afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2311afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2312afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2313afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2314afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2315afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2316afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2317afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2318afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2319afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2320afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2321afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2322afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2323afb2bd1cSJunchao Zhang 
2324afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2325afb2bd1cSJunchao Zhang       } else {
2326afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2327afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2328afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2329afb2bd1cSJunchao Zhang       }
2330afb2bd1cSJunchao Zhang 
2331afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2332afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2333afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2334afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2335afb2bd1cSJunchao Zhang                                beta,
2336afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2337afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2338afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2339afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2340afb2bd1cSJunchao Zhang      #else
23417656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2342e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2343a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2344afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2345aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2346e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
234757d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2348afb2bd1cSJunchao Zhang      #endif
2349aa372e3fSPaul Mullowney     } else {
2350213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2351afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2352afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2353afb2bd1cSJunchao Zhang        #else
2354301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2355e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2356afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2357e6e9a74fSStefano Zampini                                  xptr, beta,
235857d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2359afb2bd1cSJunchao Zhang        #endif
2360a65300a6SPaul Mullowney       }
2361aa372e3fSPaul Mullowney     }
236205035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2363958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2364aa372e3fSPaul Mullowney 
2365e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2366213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2367213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2368213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2369e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2370213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
23717656d835SStefano Zampini         }
2372213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2373c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
23747656d835SStefano Zampini       }
23757656d835SStefano Zampini 
2376213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2377213423ffSJunchao Zhang       if (compressed) {
2378213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2379e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2380c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2381e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2382c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
238305035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2384958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2385e6e9a74fSStefano Zampini       }
2386e6e9a74fSStefano Zampini     } else {
2387e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2388e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2389e6e9a74fSStefano Zampini       }
2390e6e9a74fSStefano Zampini     }
2391e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2392213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2393213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
23949ae82921SPaul Mullowney   } catch(char *ex) {
23959ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
23969ae82921SPaul Mullowney   }
2397e6e9a74fSStefano Zampini   if (yy) {
2398958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2399e6e9a74fSStefano Zampini   } else {
2400e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2401e6e9a74fSStefano Zampini   }
24029ae82921SPaul Mullowney   PetscFunctionReturn(0);
24039ae82921SPaul Mullowney }
24049ae82921SPaul Mullowney 
24056fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2406ca45077fSPaul Mullowney {
2407b175d8bbSPaul Mullowney   PetscErrorCode ierr;
24086e111a19SKarl Rupp 
2409ca45077fSPaul Mullowney   PetscFunctionBegin;
2410e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2411ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2412ca45077fSPaul Mullowney }
2413ca45077fSPaul Mullowney 
24146fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
24159ae82921SPaul Mullowney {
24169ae82921SPaul Mullowney   PetscErrorCode              ierr;
2417*a587d139SMark   PetscSplitCSRDataStructure  *d_mat = NULL;
24189ae82921SPaul Mullowney   PetscFunctionBegin;
2419bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
24203fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2421bc3f50f2SPaul Mullowney   }
24223fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
24233fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2424*a587d139SMark   if (d_mat) {
24253fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
24263fa6b06aSMark Adams   }
24273fa6b06aSMark Adams 
24289ae82921SPaul Mullowney   PetscFunctionReturn(0);
24299ae82921SPaul Mullowney }
24309ae82921SPaul Mullowney 
24319ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2432e057df02SPaul Mullowney /*@
24339ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2434e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
2435e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2436e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2437e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2438e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
24399ae82921SPaul Mullowney 
2440d083f849SBarry Smith    Collective
24419ae82921SPaul Mullowney 
24429ae82921SPaul Mullowney    Input Parameters:
24439ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
24449ae82921SPaul Mullowney .  m - number of rows
24459ae82921SPaul Mullowney .  n - number of columns
24469ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
24479ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
24480298fd71SBarry Smith          (possibly different for each row) or NULL
24499ae82921SPaul Mullowney 
24509ae82921SPaul Mullowney    Output Parameter:
24519ae82921SPaul Mullowney .  A - the matrix
24529ae82921SPaul Mullowney 
24539ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
24549ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
24559ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
24569ae82921SPaul Mullowney 
24579ae82921SPaul Mullowney    Notes:
24589ae82921SPaul Mullowney    If nnz is given then nz is ignored
24599ae82921SPaul Mullowney 
24609ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
24619ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
24629ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
24639ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
24649ae82921SPaul Mullowney 
24659ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
24660298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
24679ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
24689ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
24699ae82921SPaul Mullowney 
24709ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
24719ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
24729ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
24739ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
24749ae82921SPaul Mullowney 
24759ae82921SPaul Mullowney    Level: intermediate
24769ae82921SPaul Mullowney 
2477e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
24789ae82921SPaul Mullowney @*/
24799ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
24809ae82921SPaul Mullowney {
24819ae82921SPaul Mullowney   PetscErrorCode ierr;
24829ae82921SPaul Mullowney 
24839ae82921SPaul Mullowney   PetscFunctionBegin;
24849ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
24859ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
24869ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
24879ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
24889ae82921SPaul Mullowney   PetscFunctionReturn(0);
24899ae82921SPaul Mullowney }
24909ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - Destroy routine installed in A->ops->destroy for MATSEQAIJCUSPARSE.

   Releases the GPU-side struct hanging off A->spptr (Mat_SeqAIJCUSPARSE for unfactored matrices,
   Mat_SeqAIJCUSPARSETriFactors otherwise), frees the optional device-resident split-CSR
   structure, removes the functions composed on A and finally calls MatDestroy_SeqAIJ().

   Input Parameter:
.  A - the matrix being destroyed
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* MatSeqAIJCUSPARSE_Destroy() tolerates a NULL struct, so do not dereference spptr unconditionally here */
    if (cusp) {
      /* detach the device matrix first: it is freed below, after the host struct is gone */
      d_mat           = cusp->deviceMat;
      cusp->deviceMat = NULL;
    }
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* fetch the struct from the device to learn the device pointers stored inside it */
    err = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* diag.i is released here only when compressed rows are in use */
    if (a->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  /* also composed by MatBindToCPU_SeqAIJCUSPARSE(); clear it like the others */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
25249ae82921SPaul Mullowney 
2525ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
252695639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
   MatDuplicate_SeqAIJCUSPARSE - Duplicates A with the SeqAIJ routine and then converts the
   result, in place, to MATSEQAIJCUSPARSE.

   Input Parameters:
+  A        - the matrix to duplicate
-  cpvalues - duplicate option, forwarded unchanged to MatDuplicate_SeqAIJ()

   Output Parameter:
.  B - the new matrix
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: host-side duplicate */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* step 2: turn the duplicate into a CUSPARSE matrix without creating another object */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
25369ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y for MATSEQAIJCUSPARSE matrices.

   Input Parameters:
+  Y   - matrix that is updated
.  a   - scalar multiplier
.  X   - matrix that is added
-  str - nonzero pattern relation between X and Y

   Notes:
   Falls back to MatAXPY_SeqAIJ() when either matrix is not MATSEQAIJCUSPARSE or when the
   nonzero patterns differ. With DIFFERENT_NONZERO_PATTERN the row offsets and column indices
   are compared on the host; if they match, the SAME_NONZERO_PATTERN path is taken.
   When both matrices have a CSR structure on the GPU the update is a single cuBLAS axpy over
   the value arrays; the result is then copied back to the host.

   Errors with PETSC_ERR_PLIB for factored matrices, non-CSR storage formats, or an
   unexpected offload state of Y.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *x,*y;
  PetscBool      flgx,flgy;

  PetscFunctionBegin;
  /* validate the handles before anything dereferences them or returns early */
  PetscValidHeaderSpecific(Y,MAT_CLASSID,1);
  PetscValidHeaderSpecific(X,MAT_CLASSID,3);
  if (a == (PetscScalar)0.0) PetscFunctionReturn(0);
  ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr);
  ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr);
  if (!flgx || !flgy) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE");
  x = (Mat_SeqAIJ*)X->data;
  y = (Mat_SeqAIJ*)Y->data;
  /* the caller may not know the patterns coincide: check on the host and upgrade str if they do */
  if (str == DIFFERENT_NONZERO_PATTERN && x->nz == y->nz) {
    PetscBool e;

    ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr);
    if (e) {
      ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr);
    }
    if (e) str = SAME_NONZERO_PATTERN;
  }
  if (str != SAME_NONZERO_PATTERN) {
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  } else {
    Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr;

    if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
    if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) {
      /* no GPU structure for at least one operand: do the update on the host, then push both to the GPU */
      if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
      }
      if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) {
        ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr);
      }
      ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
    } else {
      cublasHandle_t cublasv2handle;
      cublasStatus_t cberr;
      cudaError_t    err;
      PetscScalar    alpha = a;
      PetscBLASInt   one = 1, bnz = 1;
      CsrMatrix      *matrix_y,*matrix_x;
      PetscScalar    *aa_y,*aa_x;

      /* The axpy below reads/writes the device value arrays, so values last changed on the host
         must be uploaded first; otherwise stale device data would be combined.
         NOTE(review): assumes MatSeqAIJCUSPARSECopyToGPU() does nothing when the device copy is already current - confirm */
      ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
      /* refuse to touch Y on the device unless its device copy is valid (checked before, not after, modifying it) */
      if (Y->offloadmask != PETSC_OFFLOAD_BOTH && Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state");
      /* fetch the device structures only now, after the uploads, in case they were rebuilt */
      if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"Missing CUSPARSE matrix");
      matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat;
      matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat;
      aa_y = matrix_y->values->data().get();
      aa_x = matrix_x->values->data().get();
      ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
      ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
      cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
      ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
      ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr);
      /* the newest values of Y now live on the device only */
      Y->offloadmask = PETSC_OFFLOAD_GPU;
      /* NOTE(review): an existing cusparsestruct_y->matTranspose is not refreshed here - confirm it is regenerated before use */
      ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
261195639643SRichard Tran Mills 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeros the stored values of A on the host and, when a CSR
   copy exists on the GPU, on the device as well.

   Input Parameter:
.  A - the matrix

   Notes:
   The nonzero pattern is left unchanged. On return A->offloadmask is PETSC_OFFLOAD_BOTH if the
   device values were zeroed too, PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* Only the CSR format stores a CsrMatrix behind the opaque mat pointer; for ELL/HYB it is a
       cusparseHybMat_t and must not be reinterpreted. In that case the device copy is left alone
       and the offload mask below marks the host as the only valid copy. */
    if (spptr && spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix *matrix = (CsrMatrix*)spptr->mat->mat;

        if (matrix && matrix->values) {
          both = PETSC_TRUE;
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix *matrixT = (CsrMatrix*)spptr->matTranspose->mat;

        if (matrixT && matrixT->values) {
          thrust::fill(thrust::device,matrixT->values->begin(),matrixT->values->end(),0.);
        }
      }
    }
  }
  /* host side: zero all a->i[nrows] stored values directly */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
26433fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Selects whether A uses the host SeqAIJ kernels or the CUSPARSE ones.

   Input Parameters:
+  A   - the matrix; factored matrices are returned untouched
-  flg - PETSC_TRUE installs the SeqAIJ operations (after copying the values back from the GPU),
         PETSC_FALSE installs the CUSPARSE operations

   Notes:
   Besides the function table, the composed functions for products, COO assembly and
   MatSeqAIJGetArray() are switched, and A->boundtocpu and the inode flag are set to flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (!flg) {
    /* GPU execution: CUSPARSE kernels and GPU-aware composed functions */
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* CPU execution: bring the values back before the host kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  }
  A->boundtocpu  = flg;
  aij->inode.use = flg;
  PetscFunctionReturn(0);
}
2686*a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Converts a SeqAIJ matrix to MATSEQAIJCUSPARSE.

   Input Parameters:
+  A     - the matrix to convert
.  mtype - requested type (not read by this routine)
-  reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
           existing *newmat, any other value works directly on *newmat

   Output Parameter:
.  newmat - the converted matrix

   Notes:
   Sets the default vector type to VECCUDA, allocates the GPU-side struct (with its own
   cusparse handle) if none is attached yet, installs the CUSPARSE operations and renames
   the object type.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* in-place style conversion: *newmat is used as given */
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *tri;

      ierr = PetscNew(&tri);CHKERRQ(ierr);
      stat = cusparseCreate(&tri->handle);CHKERRCUSPARSE(stat);
      B->spptr = tri;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      cusp->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      B->spptr = cusp;
      cusp->deviceMat = NULL;
    }
    /* nothing has been placed on the GPU yet */
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;

  /* install the GPU kernels and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27339ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE.

   Checks that CUDA is initialized, builds B as a SeqAIJ matrix, converts it in place to
   the CUSPARSE type and processes the CUSPARSE-specific options for the object.

   Input Parameter:
.  B - the matrix object being set up
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);

  /* host part first, then the in-place switch to the GPU type */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);

  /* PetscObjectOptionsBegin()/PetscOptionsEnd() bracket the option queries made below */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
274702fe1965SBarry Smith 
27483ca39a21SBarry Smith /*MC
2749e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2750e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported since CUDA 11.0.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
2754e057df02SPaul Mullowney 
2755e057df02SPaul Mullowney    Options Database Keys:
2756e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2757aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2758a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2759e057df02SPaul Mullowney 
2760e057df02SPaul Mullowney   Level: beginner
2761e057df02SPaul Mullowney 
27628468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2763e057df02SPaul Mullowney M*/
27647f756511SDominic Meiser 
276542c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
276642c9c57cSBarry Smith 
27670f39cd5aSBarry Smith 
/*
   MatSolverTypeRegister_CUSPARSE - Registers MatGetFactor_seqaijcusparse_cusparse() as the
   MATSOLVERCUSPARSE factory for MATSEQAIJCUSPARSE, for the LU, Cholesky, ILU and ICC factor types.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode      ierr;
  PetscInt            k;
  /* same registration order as always: LU, Cholesky, ILU, ICC */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const PetscInt      nftypes  = (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0]));

  PetscFunctionBegin;
  for (k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
277929b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Frees a Mat_SeqAIJCUSPARSE struct and everything it owns here:
   the mult structs for the matrix and its transpose, the work/offset/COO arrays, the cusparse
   handle and (CUDA >= 11) the csr2csc buffer.

   Input Parameter:
.  cusparsestruct - address of the struct pointer; a NULL struct is accepted and ignored
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;

  PetscFunctionBegin;
  if (cusp) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
    /* device arrays owned by the struct */
    delete cusp->workVector;
    delete cusp->rowoffsets_gpu;
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    if (cusp->handle) {
      stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    {
      cudaError_t cerr = cudaFree(cusp->csr2cscBuffer);CHKERRCUDA(cerr);
    }
   #endif
    /* free through the caller's pointer, as before */
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
28037f756511SDominic Meiser 
/*
   CsrMatrix_Destroy - Deletes a CsrMatrix together with its values, column index and row
   offset arrays, and resets the caller's pointer.

   Input Parameter:
.  mat - address of the CsrMatrix pointer; nothing is done when it points to NULL
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
28167f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - (triangular factor overload) Releases one
   Mat_SeqAIJCUSPARSETriFactorStruct: its matrix descriptor, solve analysis info, CSR matrix
   and device buffers, then the struct itself.

   Input Parameter:
.  trifactor - address of the struct pointer; a NULL struct is accepted and ignored
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  PetscErrorCode                    ierr;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;

  PetscFunctionBegin;
  if (tf) {
    if (tf->descr) {
      stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
    }
    if (tf->solveInfo) {
      stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
    }
    ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
    if (tf->solveBuffer) {
      cudaError_t cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (tf->csr2cscBuffer) {
      cudaError_t cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
    }
   #endif
    /* free through the caller's pointer, as before */
    ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
28357f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy - (mult struct overload) Releases a
   Mat_SeqAIJCUSPARSEMultStruct: the stored matrix (CSR, or ELL/HYB before CUDA 11), the matrix
   descriptor, the compressed row indices, the device scalars and, with CUDA >= 11, the SpMat
   descriptor and the per-operation SpMV buffers/descriptors. The caller's pointer is set to NULL.

   Input Parameters:
+  matstruct - address of the struct pointer; a NULL struct is accepted and ignored
-  format    - storage format of the matrix held by the struct; selects how (*matstruct)->mat is released

   Notes:
   With CUDA >= 11 an ELL/HYB format raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* scalar constants that were allocated with the CUDA runtime */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* three cuSpMV slots; only the initialized ones own resources */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
28797f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSETriFactors_Reset - Releases the contents of a Mat_SeqAIJCUSPARSETriFactors
   struct (the four triangular factor structs, the permutation index arrays and the work vector)
   while keeping the struct itself allocated.

   Input Parameter:
.  trifactors - address of the struct pointer; a NULL struct is accepted and ignored
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;

  PetscFunctionBegin;
  if (!fs) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);
  /* delete each array and clear its pointer so the struct can be reused */
  delete fs->rpermIndices;
  delete fs->cpermIndices;
  delete fs->workVector;
  fs->rpermIndices = NULL;
  fs->cpermIndices = NULL;
  fs->workVector   = NULL;
  PetscFunctionReturn(0);
}
2899ccdfe979SStefano Zampini 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - Resets a Mat_SeqAIJCUSPARSETriFactors struct, destroys
   its cusparse handle (if any) and frees the struct.

   Input Parameter:
.  trifactors - address of the struct pointer; a NULL struct is accepted and ignored
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
29167e8381f9SStefano Zampini 
/* Strict "less than" on (first,second) index pairs: ordered by the first entry, ties broken by the second */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    /* differing first entries decide; otherwise fall through to the second entries */
    if (t1.get<0>() != t2.get<0>()) return t1.get<0>() < t2.get<0>();
    return t1.get<1>() < t2.get<1>();
  }
};
29277e8381f9SStefano Zampini 
/* Equality on index pairs: true only when both entries match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
29377e8381f9SStefano Zampini 
/* Difference indicator: 0 when the two indices are equal, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
29467e8381f9SStefano Zampini 
/* Logical OR of two indicators, returned as a PetscInt: 1 when either argument is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
29557e8381f9SStefano Zampini 
29567e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - Sets (or adds) the values of a SeqAIJCUSPARSE matrix from an
  array given in COO ordering, operating on the device copy of the CSR values.

  Input Parameters:
+ A     - the matrix; A->spptr must point to a Mat_SeqAIJCUSPARSE that already holds a CsrMatrix
. v     - array with one value per COO entry (cusp->cooPerm->size() entries), or NULL to use zeros;
          it is copied into a device array through plain pointers, i.e. it is treated as host memory
- imode - ADD_VALUES accumulates into the stored values, any other mode overwrites them

  Notes:
  cusp->cooPerm permutes the user values into the stored ordering. When cusp->cooPerm_a is present
  (repeated COO entries) it provides, for every permuted entry, the key/index of the stored value
  it contributes to.

  If no COO permutation is available the matrix is simply assembled.
  On return the matrix is assembled and the values have been copied back to the host.
*/
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) { /* no COO permutation: there is nothing to scatter, just assemble */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  n = cusp->cooPerm->size();
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (v) { /* stage the user values on the device */
    cusp->cooPerm_v->assign(v,v+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
  }
  else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  if (imode == ADD_VALUES) {
    if (cusp->cooPerm_a) { /* repeated entries: sum the contributions per stored value, then accumulate */
      const size_t nvals = matrix->values->size();

      /* the work array must hold one entry per stored value; a cached array may stem from a
         previous nonzero pattern with a different number of unique entries */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(nvals);
      else if (cusp->cooPerm_w->size() != nvals) cusp->cooPerm_w->resize(nvals);
      auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
    } else { /* unique entries: permuted values map one-to-one onto the stored values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
    }
  } else {
    if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
                              if we are inserting two different values into the same location */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    } else { /* unique entries: plain permuted copy into the stored values */
      auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
                                                                matrix->values->begin()));
      auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
                                                                matrix->values->end()));
      thrust::for_each(zibit,zieit,VecCUDAEquals());
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* the up-to-date values now live on the device */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30217e8381f9SStefano Zampini 
30227e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - Builds the CSR nonzero structure of a SeqAIJCUSPARSE
  matrix from a list of (row,column) pairs in COO format and caches the device-side permutation
  data later consumed by MatSetValuesCOO_SeqAIJCUSPARSE().

  Input Parameters:
+ A     - the matrix; A->spptr must point to a Mat_SeqAIJCUSPARSE
. n     - number of COO entries
. coo_i - row indices of the entries (n entries, copied to the device through plain pointers,
          i.e. treated as host memory)
- coo_j - column indices of the entries (same length and treatment as coo_i)

  Notes:
  The pairs are sorted on the device (ordering given by IJCompare, defined elsewhere in this file)
  and the sorting permutation is stored in cusp->cooPerm. If the list contains repeated pairs,
  cusp->cooPerm_a is kept and filled with a running index built from IJDiff/IJSum (defined
  elsewhere; presumably the index of the unique entry each sorted pair maps to - TODO confirm),
  otherwise it is freed and set to NULL.

  The host CSR arrays of A are rebuilt from the unique pairs, the structure is pushed to the GPU
  with zeroed values, and the matrix is left unassembled.
*/
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* arrays of length n can only be reused when n is unchanged */
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_a = NULL;
  }
  /* cooPerm_w is sized by the number of stored values (unique entries), which may change even
     when n does not: always drop it, it is re-created on demand when values are set */
  delete cusp->cooPerm_w;
  cusp->cooPerm_w = NULL;
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* sort the (i,j) pairs, recording in cooPerm where each sorted entry came from */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted indices: thrust::unique below compacts d_i and d_j in place */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      /* adjacent_difference copies the first element through unchanged: reset it */
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* for every row r, count the unique entries with row index <= r: this is the CSR offset i[r+1] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), thrust::get<0>(nekey.get_iterator_tuple()),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* replace the host CSR arrays with separately allocated ones */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    /* zero-initialized so that the host copy (marked valid below) never exposes garbage */
    ierr = PetscCalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* after thrust::unique the first a->nz entries of d_j are the column indices of the unique pairs */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr); /* count the rows with at least one entry */
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy the (zeroed) host values to the GPU */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    /* the cached transpose refers to the old nonzero structure */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3127