xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision da79fbbcd3b8328c1bc8d5ae1d52ff9dcd9d7370)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
19afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
20afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
21afb2bd1cSJunchao Zhang 
22afb2bd1cSJunchao Zhang   typedef enum {
23afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
24afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
25afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
27afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
28afb2bd1cSJunchao Zhang 
29afb2bd1cSJunchao Zhang   typedef enum {
30afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
31afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
34afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
35afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
42afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
43afb2bd1cSJunchao Zhang 
44afb2bd1cSJunchao Zhang   typedef enum {
45afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
47afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
48afb2bd1cSJunchao Zhang   */
49afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
52afb2bd1cSJunchao Zhang #endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
749ae82921SPaul Mullowney 
757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
817f756511SDominic Meiser 
827e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
837e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
847e8381f9SStefano Zampini 
85b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
86b06137fdSPaul Mullowney {
87b06137fdSPaul Mullowney   cusparseStatus_t   stat;
88b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
89b06137fdSPaul Mullowney 
90b06137fdSPaul Mullowney   PetscFunctionBegin;
91d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
92b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
9357d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
94b06137fdSPaul Mullowney   PetscFunctionReturn(0);
95b06137fdSPaul Mullowney }
96b06137fdSPaul Mullowney 
97b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
98b06137fdSPaul Mullowney {
99b06137fdSPaul Mullowney   cusparseStatus_t   stat;
100b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
101b06137fdSPaul Mullowney 
102b06137fdSPaul Mullowney   PetscFunctionBegin;
103d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1046b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
10516a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
10657d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
10716a2e217SAlejandro Lamas Daviña     }
108b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1096b1cf21dSAlejandro Lamas Daviña   }
11057d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
111b06137fdSPaul Mullowney   PetscFunctionReturn(0);
112b06137fdSPaul Mullowney }
113b06137fdSPaul Mullowney 
114b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
115b06137fdSPaul Mullowney {
116b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1177e8381f9SStefano Zampini   PetscBool          flg;
1187e8381f9SStefano Zampini   PetscErrorCode     ierr;
119ccdfe979SStefano Zampini 
120b06137fdSPaul Mullowney   PetscFunctionBegin;
1217e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1227e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
123ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
124b06137fdSPaul Mullowney   PetscFunctionReturn(0);
125b06137fdSPaul Mullowney }
126b06137fdSPaul Mullowney 
127ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1289ae82921SPaul Mullowney {
1299ae82921SPaul Mullowney   PetscFunctionBegin;
1309ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1319ae82921SPaul Mullowney   PetscFunctionReturn(0);
1329ae82921SPaul Mullowney }
1339ae82921SPaul Mullowney 
134c708e6cdSJed Brown /*MC
135087f3262SPaul Mullowney   MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
136087f3262SPaul Mullowney   on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported
137087f3262SPaul Mullowney   algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer
138087f3262SPaul Mullowney   performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
139087f3262SPaul Mullowney   CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
140087f3262SPaul Mullowney   algorithms are not recommended. This class does NOT support direct solver operations.
141c708e6cdSJed Brown 
1429ae82921SPaul Mullowney   Level: beginner
143c708e6cdSJed Brown 
1443ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
145c708e6cdSJed Brown M*/
1469ae82921SPaul Mullowney 
14742c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1489ae82921SPaul Mullowney {
1499ae82921SPaul Mullowney   PetscErrorCode ierr;
150bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1519ae82921SPaul Mullowney 
1529ae82921SPaul Mullowney   PetscFunctionBegin;
153bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
154bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1552c7c0729SBarry Smith   (*B)->factortype = ftype;
1562c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1579ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1582205254eSKarl Rupp 
159087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16033d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1619ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1629ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
163087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
164087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
165087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1669ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
167bc3f50f2SPaul Mullowney 
168fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1693ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1709ae82921SPaul Mullowney   PetscFunctionReturn(0);
1719ae82921SPaul Mullowney }
1729ae82921SPaul Mullowney 
173bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
174ca45077fSPaul Mullowney {
175aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1766e111a19SKarl Rupp 
177ca45077fSPaul Mullowney   PetscFunctionBegin;
178ca45077fSPaul Mullowney   switch (op) {
179e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
180aa372e3fSPaul Mullowney     cusparsestruct->format = format;
181ca45077fSPaul Mullowney     break;
182e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
183aa372e3fSPaul Mullowney     cusparsestruct->format = format;
184ca45077fSPaul Mullowney     break;
185ca45077fSPaul Mullowney   default:
18636d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
187ca45077fSPaul Mullowney   }
188ca45077fSPaul Mullowney   PetscFunctionReturn(0);
189ca45077fSPaul Mullowney }
1909ae82921SPaul Mullowney 
191e057df02SPaul Mullowney /*@
192e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
193e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
194aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
195e057df02SPaul Mullowney    Not Collective
196e057df02SPaul Mullowney 
197e057df02SPaul Mullowney    Input Parameters:
1988468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
19936d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2002692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
201e057df02SPaul Mullowney 
202e057df02SPaul Mullowney    Output Parameter:
203e057df02SPaul Mullowney 
204e057df02SPaul Mullowney    Level: intermediate
205e057df02SPaul Mullowney 
2068468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
207e057df02SPaul Mullowney @*/
208e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
209e057df02SPaul Mullowney {
210e057df02SPaul Mullowney   PetscErrorCode ierr;
2116e111a19SKarl Rupp 
212e057df02SPaul Mullowney   PetscFunctionBegin;
213e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
214e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
215e057df02SPaul Mullowney   PetscFunctionReturn(0);
216e057df02SPaul Mullowney }
217e057df02SPaul Mullowney 
218e6e9a74fSStefano Zampini /*@
219e6e9a74fSStefano Zampini    MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose
220e6e9a74fSStefano Zampini 
221e6e9a74fSStefano Zampini    Collective on mat
222e6e9a74fSStefano Zampini 
223e6e9a74fSStefano Zampini    Input Parameters:
224e6e9a74fSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE
225e6e9a74fSStefano Zampini -  transgen - the boolean flag
226e6e9a74fSStefano Zampini 
227e6e9a74fSStefano Zampini    Level: intermediate
228e6e9a74fSStefano Zampini 
229e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE
230e6e9a74fSStefano Zampini @*/
231e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
232e6e9a74fSStefano Zampini {
233e6e9a74fSStefano Zampini   PetscErrorCode ierr;
234e6e9a74fSStefano Zampini   PetscBool      flg;
235e6e9a74fSStefano Zampini 
236e6e9a74fSStefano Zampini   PetscFunctionBegin;
237e6e9a74fSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
238e6e9a74fSStefano Zampini   ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
239e6e9a74fSStefano Zampini   if (flg) {
240e6e9a74fSStefano Zampini     Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
24154da937aSStefano Zampini 
242e6e9a74fSStefano Zampini     if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
243e6e9a74fSStefano Zampini     cusp->transgen = transgen;
24454da937aSStefano Zampini     if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */
24554da937aSStefano Zampini       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
24654da937aSStefano Zampini     }
247e6e9a74fSStefano Zampini   }
248e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
249e6e9a74fSStefano Zampini }
250e6e9a74fSStefano Zampini 
2514416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2529ae82921SPaul Mullowney {
2539ae82921SPaul Mullowney   PetscErrorCode           ierr;
254e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2559ae82921SPaul Mullowney   PetscBool                flg;
256a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2576e111a19SKarl Rupp 
2589ae82921SPaul Mullowney   PetscFunctionBegin;
259e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2609ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
26154da937aSStefano Zampini     PetscBool transgen = cusparsestruct->transgen;
26254da937aSStefano Zampini 
26354da937aSStefano Zampini     ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr);
264afb2bd1cSJunchao Zhang     if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);}
265afb2bd1cSJunchao Zhang 
266e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
267a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
268afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
269afb2bd1cSJunchao Zhang 
2704c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
271a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
272afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
273afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
274afb2bd1cSJunchao Zhang     cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */
275afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
276afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
277afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
278afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
279afb2bd1cSJunchao Zhang 
280afb2bd1cSJunchao Zhang     cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
281afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
282afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
283afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
284afb2bd1cSJunchao Zhang 
285afb2bd1cSJunchao Zhang     cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
286afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
287afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
288afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
289afb2bd1cSJunchao Zhang    #endif
2904c87dfd4SPaul Mullowney   }
2910af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2929ae82921SPaul Mullowney   PetscFunctionReturn(0);
2939ae82921SPaul Mullowney }
2949ae82921SPaul Mullowney 
2956fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2969ae82921SPaul Mullowney {
297*da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2989ae82921SPaul Mullowney   PetscErrorCode               ierr;
2999ae82921SPaul Mullowney 
3009ae82921SPaul Mullowney   PetscFunctionBegin;
301*da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3029ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3039ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3049ae82921SPaul Mullowney   PetscFunctionReturn(0);
3059ae82921SPaul Mullowney }
3069ae82921SPaul Mullowney 
3076fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3089ae82921SPaul Mullowney {
309*da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3109ae82921SPaul Mullowney   PetscErrorCode               ierr;
3119ae82921SPaul Mullowney 
3129ae82921SPaul Mullowney   PetscFunctionBegin;
313*da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3149ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3159ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3169ae82921SPaul Mullowney   PetscFunctionReturn(0);
3179ae82921SPaul Mullowney }
3189ae82921SPaul Mullowney 
319087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
320087f3262SPaul Mullowney {
321*da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
322087f3262SPaul Mullowney   PetscErrorCode               ierr;
323087f3262SPaul Mullowney 
324087f3262SPaul Mullowney   PetscFunctionBegin;
325*da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
326087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
327087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
328087f3262SPaul Mullowney   PetscFunctionReturn(0);
329087f3262SPaul Mullowney }
330087f3262SPaul Mullowney 
331087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
332087f3262SPaul Mullowney {
333*da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
334087f3262SPaul Mullowney   PetscErrorCode               ierr;
335087f3262SPaul Mullowney 
336087f3262SPaul Mullowney   PetscFunctionBegin;
337*da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
338087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
339087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
340087f3262SPaul Mullowney   PetscFunctionReturn(0);
341087f3262SPaul Mullowney }
342087f3262SPaul Mullowney 
343087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3449ae82921SPaul Mullowney {
3459ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3469ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3479ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
348aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3499ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3509ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3519ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3529ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3539ae82921SPaul Mullowney   PetscScalar                       *AALo;
3549ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
355b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
35657d48284SJunchao Zhang   cudaError_t                       cerr;
3579ae82921SPaul Mullowney 
3589ae82921SPaul Mullowney   PetscFunctionBegin;
359cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
360c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3619ae82921SPaul Mullowney     try {
3629ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3639ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
364*da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
365*da79fbbcSStefano Zampini       if (!loTriFactor) {
3669ae82921SPaul Mullowney 
3679ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
36857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
36957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3709ae82921SPaul Mullowney 
3719ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3729ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3739ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3749ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3759ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3769ae82921SPaul Mullowney         v        = aa;
3779ae82921SPaul Mullowney         vi       = aj;
3789ae82921SPaul Mullowney         offset   = 1;
3799ae82921SPaul Mullowney         rowOffset= 1;
3809ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3819ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
382e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3839ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3849ae82921SPaul Mullowney           rowOffset += nz+1;
3859ae82921SPaul Mullowney 
386580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
387580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3889ae82921SPaul Mullowney 
3899ae82921SPaul Mullowney           offset      += nz;
3909ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3919ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3929ae82921SPaul Mullowney           offset      += 1;
3939ae82921SPaul Mullowney 
3949ae82921SPaul Mullowney           v  += nz;
3959ae82921SPaul Mullowney           vi += nz;
3969ae82921SPaul Mullowney         }
3972205254eSKarl Rupp 
398aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
399*da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
400*da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
401aa372e3fSPaul Mullowney         /* Create the matrix description */
40257d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
40357d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4041b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
405afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
406afb2bd1cSJunchao Zhang        #else
40757d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
408afb2bd1cSJunchao Zhang        #endif
40957d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41057d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
411aa372e3fSPaul Mullowney 
412aa372e3fSPaul Mullowney         /* set the operation */
413aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
414aa372e3fSPaul Mullowney 
415aa372e3fSPaul Mullowney         /* set the matrix */
416aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
417aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
418aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
419aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
420aa372e3fSPaul Mullowney 
421aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
422aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
423aa372e3fSPaul Mullowney 
424aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
425aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
426aa372e3fSPaul Mullowney 
427aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
428aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
429aa372e3fSPaul Mullowney 
430afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
431*da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
432afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4331b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
434afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
435afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
436afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
437afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
438afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
439afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
440afb2bd1cSJunchao Zhang       #endif
441afb2bd1cSJunchao Zhang 
442aa372e3fSPaul Mullowney         /* perform the solve analysis */
443aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
444aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
445aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
446afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4471b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
448afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
449afb2bd1cSJunchao Zhang                                #endif
450afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
451*da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
452*da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
453aa372e3fSPaul Mullowney 
454*da79fbbcSStefano Zampini         /* assign the pointer */
455aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4562205254eSKarl Rupp 
45757d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
45857d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4594863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
460*da79fbbcSStefano Zampini       } else { /* update values only */
461*da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
462*da79fbbcSStefano Zampini         AALo[0]  = 1.0;
463*da79fbbcSStefano Zampini         v        = aa;
464*da79fbbcSStefano Zampini         vi       = aj;
465*da79fbbcSStefano Zampini         offset   = 1;
466*da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
467*da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
468*da79fbbcSStefano Zampini           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
469*da79fbbcSStefano Zampini           offset      += nz;
470*da79fbbcSStefano Zampini           AALo[offset] = 1.0;
471*da79fbbcSStefano Zampini           offset      += 1;
472*da79fbbcSStefano Zampini           v  += nz;
473*da79fbbcSStefano Zampini         }
474*da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
475*da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
476*da79fbbcSStefano Zampini       }
477*da79fbbcSStefano Zampini       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
4789ae82921SPaul Mullowney     } catch(char *ex) {
4799ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4809ae82921SPaul Mullowney     }
4819ae82921SPaul Mullowney   }
4829ae82921SPaul Mullowney   PetscFunctionReturn(0);
4839ae82921SPaul Mullowney }
4849ae82921SPaul Mullowney 
485087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4869ae82921SPaul Mullowney {
4879ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4889ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4899ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
490aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
4919ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4929ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
4939ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
4949ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4959ae82921SPaul Mullowney   PetscScalar                       *AAUp;
4969ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
4979ae82921SPaul Mullowney   PetscErrorCode                    ierr;
49857d48284SJunchao Zhang   cudaError_t                       cerr;
4999ae82921SPaul Mullowney 
5009ae82921SPaul Mullowney   PetscFunctionBegin;
501cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
502c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5039ae82921SPaul Mullowney     try {
5049ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5059ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
506*da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
507*da79fbbcSStefano Zampini       if (!upTriFactor) {
5089ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
50957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
51057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
5119ae82921SPaul Mullowney 
5129ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5139ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5149ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5159ae82921SPaul Mullowney         offset = nzUpper;
5169ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5179ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5189ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5199ae82921SPaul Mullowney 
520e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5219ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5229ae82921SPaul Mullowney 
523e057df02SPaul Mullowney           /* decrement the offset */
5249ae82921SPaul Mullowney           offset -= (nz+1);
5259ae82921SPaul Mullowney 
526e057df02SPaul Mullowney           /* first, set the diagonal elements */
5279ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
52809f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5299ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5309ae82921SPaul Mullowney 
531580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
532580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5339ae82921SPaul Mullowney         }
5342205254eSKarl Rupp 
535aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
536*da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
537*da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5382205254eSKarl Rupp 
539aa372e3fSPaul Mullowney         /* Create the matrix description */
54057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
54157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
5421b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
543afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
544afb2bd1cSJunchao Zhang        #else
54557d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
546afb2bd1cSJunchao Zhang        #endif
54757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
54857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
549aa372e3fSPaul Mullowney 
550aa372e3fSPaul Mullowney         /* set the operation */
551aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
552aa372e3fSPaul Mullowney 
553aa372e3fSPaul Mullowney         /* set the matrix */
554aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
555aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
556aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
557aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
558aa372e3fSPaul Mullowney 
559aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
560aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
561aa372e3fSPaul Mullowney 
562aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
563aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
564aa372e3fSPaul Mullowney 
565aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
566aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
567aa372e3fSPaul Mullowney 
568afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
569*da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
570afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
5711b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
572afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
573afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
574afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
575afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
576afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
577afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
578afb2bd1cSJunchao Zhang       #endif
579afb2bd1cSJunchao Zhang 
580aa372e3fSPaul Mullowney         /* perform the solve analysis */
581aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
582aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
583aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
584afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
5851b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
586afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
587afb2bd1cSJunchao Zhang                                #endif
588afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
589*da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
590*da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
591aa372e3fSPaul Mullowney 
592*da79fbbcSStefano Zampini         /* assign the pointer */
593aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
5942205254eSKarl Rupp 
59557d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
59657d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
5974863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
598*da79fbbcSStefano Zampini       } else {
599*da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
600*da79fbbcSStefano Zampini         offset = nzUpper;
601*da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
602*da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
603*da79fbbcSStefano Zampini 
604*da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
605*da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
606*da79fbbcSStefano Zampini 
607*da79fbbcSStefano Zampini           /* decrement the offset */
608*da79fbbcSStefano Zampini           offset -= (nz+1);
609*da79fbbcSStefano Zampini 
610*da79fbbcSStefano Zampini           /* first, set the diagonal elements */
611*da79fbbcSStefano Zampini           AAUp[offset] = 1./v[nz];
612*da79fbbcSStefano Zampini           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
613*da79fbbcSStefano Zampini         }
614*da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
615*da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
616*da79fbbcSStefano Zampini       }
617*da79fbbcSStefano Zampini       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
6189ae82921SPaul Mullowney     } catch(char *ex) {
6199ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6209ae82921SPaul Mullowney     }
6219ae82921SPaul Mullowney   }
6229ae82921SPaul Mullowney   PetscFunctionReturn(0);
6239ae82921SPaul Mullowney }
6249ae82921SPaul Mullowney 
625087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6269ae82921SPaul Mullowney {
6279ae82921SPaul Mullowney   PetscErrorCode               ierr;
6289ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6299ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6309ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6319ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6329ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6339ae82921SPaul Mullowney 
6349ae82921SPaul Mullowney   PetscFunctionBegin;
635*da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
636087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
637087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
6382205254eSKarl Rupp 
639*da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
640aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6419ae82921SPaul Mullowney 
642c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
643e057df02SPaul Mullowney   /* lower triangular indices */
6449ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
645*da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
646*da79fbbcSStefano Zampini     const PetscInt *r;
647*da79fbbcSStefano Zampini 
648*da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
649aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
650aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6519ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
652*da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
653*da79fbbcSStefano Zampini   }
6549ae82921SPaul Mullowney 
655e057df02SPaul Mullowney   /* upper triangular indices */
6569ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
657*da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
658*da79fbbcSStefano Zampini     const PetscInt *c;
659*da79fbbcSStefano Zampini 
660*da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
661aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
662aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6639ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
664*da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
665*da79fbbcSStefano Zampini   }
6669ae82921SPaul Mullowney   PetscFunctionReturn(0);
6679ae82921SPaul Mullowney }
6689ae82921SPaul Mullowney 
669087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
670087f3262SPaul Mullowney {
671087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
672087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
673aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
674aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
675087f3262SPaul Mullowney   cusparseStatus_t                  stat;
676087f3262SPaul Mullowney   PetscErrorCode                    ierr;
67757d48284SJunchao Zhang   cudaError_t                       cerr;
678087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
679087f3262SPaul Mullowney   PetscScalar                       *AAUp;
680087f3262SPaul Mullowney   PetscScalar                       *AALo;
681087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
682087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
683087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
684087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
685087f3262SPaul Mullowney 
686087f3262SPaul Mullowney   PetscFunctionBegin;
687cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
688c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
689087f3262SPaul Mullowney     try {
690*da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
691*da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
692*da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
693087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
69457d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
69557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
696087f3262SPaul Mullowney 
697087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
698087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
699087f3262SPaul Mullowney         AiUp[n]=nzUpper;
700087f3262SPaul Mullowney         offset = 0;
701087f3262SPaul Mullowney         for (i=0; i<n; i++) {
702087f3262SPaul Mullowney           /* set the pointers */
703087f3262SPaul Mullowney           v  = aa + ai[i];
704087f3262SPaul Mullowney           vj = aj + ai[i];
705087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
706087f3262SPaul Mullowney 
707087f3262SPaul Mullowney           /* first, set the diagonal elements */
708087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
70909f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
710087f3262SPaul Mullowney           AiUp[i]      = offset;
71109f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
712087f3262SPaul Mullowney 
713087f3262SPaul Mullowney           offset+=1;
714087f3262SPaul Mullowney           if (nz>0) {
715f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
716580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
717087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
718087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
719087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
720087f3262SPaul Mullowney             }
721087f3262SPaul Mullowney             offset+=nz;
722087f3262SPaul Mullowney           }
723087f3262SPaul Mullowney         }
724087f3262SPaul Mullowney 
725aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
726*da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
727*da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
728087f3262SPaul Mullowney 
729aa372e3fSPaul Mullowney         /* Create the matrix description */
73057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
73157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7321b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
733afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
734afb2bd1cSJunchao Zhang        #else
73557d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
736afb2bd1cSJunchao Zhang        #endif
73757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
73857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
739087f3262SPaul Mullowney 
740aa372e3fSPaul Mullowney         /* set the matrix */
741aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
742aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
743aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
744aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
745aa372e3fSPaul Mullowney 
746aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
747aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
748aa372e3fSPaul Mullowney 
749aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
750aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
751aa372e3fSPaul Mullowney 
752aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
753aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
754aa372e3fSPaul Mullowney 
755afb2bd1cSJunchao Zhang         /* set the operation */
756afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
757afb2bd1cSJunchao Zhang 
758afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
759*da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
760afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7611b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
762afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
763afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
764afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
765afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
766afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
767afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
768afb2bd1cSJunchao Zhang       #endif
769afb2bd1cSJunchao Zhang 
770aa372e3fSPaul Mullowney         /* perform the solve analysis */
771aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
772aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
773aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
774afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7751b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
776afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
777afb2bd1cSJunchao Zhang                                 #endif
778afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
779*da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
780*da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
781aa372e3fSPaul Mullowney 
782*da79fbbcSStefano Zampini         /* assign the pointer */
783aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
784aa372e3fSPaul Mullowney 
785aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
786*da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
787*da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
788aa372e3fSPaul Mullowney 
789aa372e3fSPaul Mullowney         /* Create the matrix description */
79057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
79157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7921b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
793afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
794afb2bd1cSJunchao Zhang        #else
79557d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
796afb2bd1cSJunchao Zhang        #endif
79757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
79857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
799aa372e3fSPaul Mullowney 
800aa372e3fSPaul Mullowney         /* set the operation */
801aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
802aa372e3fSPaul Mullowney 
803aa372e3fSPaul Mullowney         /* set the matrix */
804aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
805aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
806aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
807aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
808aa372e3fSPaul Mullowney 
809aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
810aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
811aa372e3fSPaul Mullowney 
812aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
813aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
814aa372e3fSPaul Mullowney 
815aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
816aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
817aa372e3fSPaul Mullowney 
818afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
819*da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
820afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8211b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
822afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
823afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
824afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
825afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
826afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
827afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
828afb2bd1cSJunchao Zhang       #endif
829afb2bd1cSJunchao Zhang 
830aa372e3fSPaul Mullowney         /* perform the solve analysis */
831aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
832aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
833aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
834afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8351b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
836afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
837afb2bd1cSJunchao Zhang                                 #endif
838afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
839*da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
840*da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
841aa372e3fSPaul Mullowney 
842*da79fbbcSStefano Zampini         /* assign the pointer */
843aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
844087f3262SPaul Mullowney 
845*da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
84657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
84757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
848*da79fbbcSStefano Zampini       } else {
849*da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
850*da79fbbcSStefano Zampini         offset = 0;
851*da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
852*da79fbbcSStefano Zampini           /* set the pointers */
853*da79fbbcSStefano Zampini           v  = aa + ai[i];
854*da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
855*da79fbbcSStefano Zampini 
856*da79fbbcSStefano Zampini           /* first, set the diagonal elements */
857*da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
858*da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
859*da79fbbcSStefano Zampini 
860*da79fbbcSStefano Zampini           offset+=1;
861*da79fbbcSStefano Zampini           if (nz>0) {
862*da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
863*da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
864*da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
865*da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
866*da79fbbcSStefano Zampini             }
867*da79fbbcSStefano Zampini             offset+=nz;
868*da79fbbcSStefano Zampini           }
869*da79fbbcSStefano Zampini         }
870*da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
871*da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
872*da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
873*da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
874*da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
875*da79fbbcSStefano Zampini       }
87657d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
87757d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
878087f3262SPaul Mullowney     } catch(char *ex) {
879087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
880087f3262SPaul Mullowney     }
881087f3262SPaul Mullowney   }
882087f3262SPaul Mullowney   PetscFunctionReturn(0);
883087f3262SPaul Mullowney }
884087f3262SPaul Mullowney 
885087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
8869ae82921SPaul Mullowney {
8879ae82921SPaul Mullowney   PetscErrorCode               ierr;
888087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
889087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
890087f3262SPaul Mullowney   IS                           ip = a->row;
891087f3262SPaul Mullowney   PetscBool                    perm_identity;
892087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
893087f3262SPaul Mullowney 
894087f3262SPaul Mullowney   PetscFunctionBegin;
895*da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
896087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
897*da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
898aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
899aa372e3fSPaul Mullowney 
900*da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
901*da79fbbcSStefano Zampini 
902087f3262SPaul Mullowney   /* lower triangular indices */
903087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
904087f3262SPaul Mullowney   if (!perm_identity) {
9054e4bbfaaSStefano Zampini     IS             iip;
906*da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9074e4bbfaaSStefano Zampini 
9084e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
9094e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
910*da79fbbcSStefano Zampini     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
911aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
912aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
913aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9144e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9154e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
9164e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
917087f3262SPaul Mullowney     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
918*da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
919*da79fbbcSStefano Zampini   }
920087f3262SPaul Mullowney   PetscFunctionReturn(0);
921087f3262SPaul Mullowney }
922087f3262SPaul Mullowney 
9236fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
9249ae82921SPaul Mullowney {
9259ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
9269ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
9279ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
928b175d8bbSPaul Mullowney   PetscErrorCode ierr;
9299ae82921SPaul Mullowney 
9309ae82921SPaul Mullowney   PetscFunctionBegin;
9319ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
932ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
933e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9349ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
9359ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
936bda325fcSPaul Mullowney   if (row_identity && col_identity) {
937bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
938bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9394e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9404e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
941bda325fcSPaul Mullowney   } else {
942bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
943bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9444e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9454e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
946bda325fcSPaul Mullowney   }
9478dc1d2a3SPaul Mullowney 
948e057df02SPaul Mullowney   /* get the triangular factors */
949087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
9509ae82921SPaul Mullowney   PetscFunctionReturn(0);
9519ae82921SPaul Mullowney }
9529ae82921SPaul Mullowney 
953087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
954087f3262SPaul Mullowney {
955087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
956087f3262SPaul Mullowney   IS             ip = b->row;
957087f3262SPaul Mullowney   PetscBool      perm_identity;
958b175d8bbSPaul Mullowney   PetscErrorCode ierr;
959087f3262SPaul Mullowney 
960087f3262SPaul Mullowney   PetscFunctionBegin;
961087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
962ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
963087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
964087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
965087f3262SPaul Mullowney   if (perm_identity) {
966087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
967087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9684e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9694e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
970087f3262SPaul Mullowney   } else {
971087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
972087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9734e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9744e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
975087f3262SPaul Mullowney   }
976087f3262SPaul Mullowney 
977087f3262SPaul Mullowney   /* get the triangular factors */
978087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
979087f3262SPaul Mullowney   PetscFunctionReturn(0);
980087f3262SPaul Mullowney }
9819ae82921SPaul Mullowney 
982b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
983bda325fcSPaul Mullowney {
984bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
985aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
986aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
987*da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
988*da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
989bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
990aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
991aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
992aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
993aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
9941b0a6780SStefano Zampini   cudaError_t                       cerr;
995*da79fbbcSStefano Zampini   PetscErrorCode                    ierr;
996b175d8bbSPaul Mullowney 
997bda325fcSPaul Mullowney   PetscFunctionBegin;
998aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
999*da79fbbcSStefano Zampini   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1000*da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1001aa372e3fSPaul Mullowney 
1002aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1003aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1004aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1005aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1006aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1007aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1008aa372e3fSPaul Mullowney 
1009aa372e3fSPaul Mullowney   /* Create the matrix description */
101057d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
101157d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
101257d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
101357d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
101457d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1015aa372e3fSPaul Mullowney 
1016aa372e3fSPaul Mullowney   /* set the operation */
1017aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1018aa372e3fSPaul Mullowney 
1019aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1020aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1021afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1022afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1023aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1024afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1025afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1026afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1027aa372e3fSPaul Mullowney 
1028aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1029afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1030afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1031afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1032afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
1033afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
1034afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
1035afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
1036afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1037afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1038afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
10391b0a6780SStefano Zampini   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1040afb2bd1cSJunchao Zhang #endif
1041afb2bd1cSJunchao Zhang 
1042*da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1043aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1044aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1045aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
1046aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
1047aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
1048aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
1049afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1050afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1051afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1052afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1053afb2bd1cSJunchao Zhang                         #else
1054afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1055afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1056afb2bd1cSJunchao Zhang                         #endif
1057afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1058*da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1059*da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1060aa372e3fSPaul Mullowney 
1061afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1062*da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1063afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
10641b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1065afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1066afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1067afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1068afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1069afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1070afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1071afb2bd1cSJunchao Zhang #endif
1072afb2bd1cSJunchao Zhang 
1073afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1074aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1075afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1076afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1077afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
10781b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1079afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1080afb2bd1cSJunchao Zhang                           #endif
1081afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
1082*da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1083*da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1084aa372e3fSPaul Mullowney 
1085*da79fbbcSStefano Zampini   /* assign the pointer */
1086aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1087aa372e3fSPaul Mullowney 
1088aa372e3fSPaul Mullowney   /*********************************************/
1089aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1090aa372e3fSPaul Mullowney   /*********************************************/
1091aa372e3fSPaul Mullowney 
1092aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1093*da79fbbcSStefano Zampini   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1094*da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1095aa372e3fSPaul Mullowney 
1096aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1097aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1098aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1099aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1100aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1101aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1102aa372e3fSPaul Mullowney 
1103aa372e3fSPaul Mullowney   /* Create the matrix description */
110457d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
110557d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
110657d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
110757d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
110857d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1109aa372e3fSPaul Mullowney 
1110aa372e3fSPaul Mullowney   /* set the operation */
1111aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1112aa372e3fSPaul Mullowney 
1113aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1114aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1115afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1116afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1117aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1118afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1119afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1120afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1121aa372e3fSPaul Mullowney 
1122aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1123afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1124afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1125afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1126afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1127afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1128afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1129afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1130afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1131afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1132afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1133afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1134afb2bd1cSJunchao Zhang #endif
1135afb2bd1cSJunchao Zhang 
1136*da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1137aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1138aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1139aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1140aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1141aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1142aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1143afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1144afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1145afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1146afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1147afb2bd1cSJunchao Zhang                         #else
1148afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1149afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1150afb2bd1cSJunchao Zhang                         #endif
1151afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1152*da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1153*da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1154aa372e3fSPaul Mullowney 
1155afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1156*da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1157afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
11581b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1159afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1160afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1161afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1162afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1163afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1164afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1165afb2bd1cSJunchao Zhang   #endif
1166afb2bd1cSJunchao Zhang 
1167afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1168aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1169afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1170afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1171afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
11721b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1173afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1174afb2bd1cSJunchao Zhang                           #endif
1175afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
1176*da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1177*da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1178aa372e3fSPaul Mullowney 
1179*da79fbbcSStefano Zampini   /* assign the pointer */
1180aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1181bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1182bda325fcSPaul Mullowney }
1183bda325fcSPaul Mullowney 
1184b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1185bda325fcSPaul Mullowney {
1186aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1187aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1188aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1189bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1190bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1191aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1192b06137fdSPaul Mullowney   cudaError_t                  err;
119385ba7357SStefano Zampini   PetscErrorCode               ierr;
1194b175d8bbSPaul Mullowney 
1195bda325fcSPaul Mullowney   PetscFunctionBegin;
119685ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
119785ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
119885ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
119985ba7357SStefano Zampini   /* create cusparse matrix */
1200aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
120157d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1202aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
120357d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
120457d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1205aa372e3fSPaul Mullowney 
1206b06137fdSPaul Mullowney   /* set alpha and beta */
1207afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12087656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12097656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1210afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12117656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12127656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
121357d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1214b06137fdSPaul Mullowney 
1215aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1216aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1217aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1218554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1219554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1220aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1221a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1222aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1223aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1224a3fdcf43SKarl Rupp 
122581902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
122681902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1227afb2bd1cSJunchao Zhang 
122881902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1229afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1230afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1231afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1232afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1233afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1234afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1235afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1236afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1237afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1238afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1239afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1240afb2bd1cSJunchao Zhang    #endif
1241afb2bd1cSJunchao Zhang 
1242a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1243a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1244aa372e3fSPaul Mullowney                             matrix->values->data().get(),
124581902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1246aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1247aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1248afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1249afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1250afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1251afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1252afb2bd1cSJunchao Zhang                           #else
1253afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1254afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1255afb2bd1cSJunchao Zhang                           #endif
1256afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
1257aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1258afb2bd1cSJunchao Zhang 
1259afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1260afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1261afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1262afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1263afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1264afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1265afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1266afb2bd1cSJunchao Zhang    #endif
1267aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1268afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1269afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1270afb2bd1cSJunchao Zhang    #else
1271aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
127251c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
127351c6d536SStefano Zampini     /* First convert HYB to CSR */
1274aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1275aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1276aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1277aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1278aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1279aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1280aa372e3fSPaul Mullowney 
1281aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1282aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1283aa372e3fSPaul Mullowney                             temp->values->data().get(),
1284aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
128557d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1286aa372e3fSPaul Mullowney 
1287aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1288aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1289aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1290aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1291aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1292aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1293aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1294aa372e3fSPaul Mullowney 
1295aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1296aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1297aa372e3fSPaul Mullowney                             temp->values->data().get(),
1298aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1299aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1300aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1301aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1302aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
130357d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1304aa372e3fSPaul Mullowney 
1305aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1306aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
130757d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1308aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1309aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1310aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1311aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1312aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1313aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
131457d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1315aa372e3fSPaul Mullowney 
1316aa372e3fSPaul Mullowney     /* assign the pointer */
1317aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1318aa372e3fSPaul Mullowney     /* delete temporaries */
1319aa372e3fSPaul Mullowney     if (tempT) {
1320aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1321aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1322aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1323aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1324087f3262SPaul Mullowney     }
1325aa372e3fSPaul Mullowney     if (temp) {
1326aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1327aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1328aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1329aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1330aa372e3fSPaul Mullowney     }
1331afb2bd1cSJunchao Zhang    #endif
1332aa372e3fSPaul Mullowney   }
133305035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
133485ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
133585ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1336213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1337213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1338aa372e3fSPaul Mullowney   /* assign the pointer */
1339aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1340bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1341bda325fcSPaul Mullowney }
1342bda325fcSPaul Mullowney 
13434e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
13446fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1345bda325fcSPaul Mullowney {
1346c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1347465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1348465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1349465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1350465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1351bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1352bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1353aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1354aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1355aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1356b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
135757d48284SJunchao Zhang   cudaError_t                           cerr;
1358bda325fcSPaul Mullowney 
1359bda325fcSPaul Mullowney   PetscFunctionBegin;
1360aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1361aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1362bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1363aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1364aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1365bda325fcSPaul Mullowney   }
1366bda325fcSPaul Mullowney 
1367bda325fcSPaul Mullowney   /* Get the GPU pointers */
1368c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1369c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1370c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1371c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1372bda325fcSPaul Mullowney 
13737a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1374aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1375c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1376c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1377c41cb2e2SAlejandro Lamas Daviña                xGPU);
1378aa372e3fSPaul Mullowney 
1379aa372e3fSPaul Mullowney   /* First, solve U */
1380aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1381afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
13821b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1383afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1384afb2bd1cSJunchao Zhang                       #endif
1385afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1386aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1387aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1388aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1389aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1390afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
13911b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1392afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1393afb2bd1cSJunchao Zhang                       #endif
1394afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1395aa372e3fSPaul Mullowney 
1396aa372e3fSPaul Mullowney   /* Then, solve L */
1397aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1398afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
13991b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1400afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1401afb2bd1cSJunchao Zhang                       #endif
1402afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1403aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1404aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1405aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1406aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1407afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
14081b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1409afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1410afb2bd1cSJunchao Zhang                       #endif
1411afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1412aa372e3fSPaul Mullowney 
1413aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1414c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1415c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1416aa372e3fSPaul Mullowney                tempGPU->begin());
1417aa372e3fSPaul Mullowney 
1418aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1419c41cb2e2SAlejandro Lamas Daviña   thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU);
1420bda325fcSPaul Mullowney 
1421bda325fcSPaul Mullowney   /* restore */
1422c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1423c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
142405035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1425661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1426958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1427bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1428bda325fcSPaul Mullowney }
1429bda325fcSPaul Mullowney 
14306fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1431bda325fcSPaul Mullowney {
1432465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1433465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1434bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1435bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1436aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1437aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1438aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1439b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
144057d48284SJunchao Zhang   cudaError_t                       cerr;
1441bda325fcSPaul Mullowney 
1442bda325fcSPaul Mullowney   PetscFunctionBegin;
1443aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
1444aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1445bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
1446aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1447aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1448bda325fcSPaul Mullowney   }
1449bda325fcSPaul Mullowney 
1450bda325fcSPaul Mullowney   /* Get the GPU pointers */
1451c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1452c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1453bda325fcSPaul Mullowney 
14547a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1455aa372e3fSPaul Mullowney   /* First, solve U */
1456aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1457afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
14581b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1459afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1460afb2bd1cSJunchao Zhang                       #endif
1461afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1462aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1463aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1464aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1465aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1466afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
14671b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1468afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1469afb2bd1cSJunchao Zhang                       #endif
1470afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1471aa372e3fSPaul Mullowney 
1472aa372e3fSPaul Mullowney   /* Then, solve L */
1473aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1474afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14751b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1476afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1477afb2bd1cSJunchao Zhang                       #endif
1478afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1479aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1480aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1481aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1482aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1483afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
14841b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1485afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1486afb2bd1cSJunchao Zhang                       #endif
1487afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1488bda325fcSPaul Mullowney 
1489bda325fcSPaul Mullowney   /* restore */
1490c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1491c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
149205035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1493661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1494958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1495bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1496bda325fcSPaul Mullowney }
1497bda325fcSPaul Mullowney 
14986fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
14999ae82921SPaul Mullowney {
1500465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1501465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1502465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1503465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15049ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15059ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1506aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1507aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1508aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1509b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
151057d48284SJunchao Zhang   cudaError_t                           cerr;
15119ae82921SPaul Mullowney 
15129ae82921SPaul Mullowney   PetscFunctionBegin;
1513ebc8f436SDominic Meiser 
1514e057df02SPaul Mullowney   /* Get the GPU pointers */
1515c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1516c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1517c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1518c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
15199ae82921SPaul Mullowney 
15207a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1521aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1522c41cb2e2SAlejandro Lamas Daviña   thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1523c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
15244e4bbfaaSStefano Zampini                tempGPU->begin());
1525aa372e3fSPaul Mullowney 
1526aa372e3fSPaul Mullowney   /* Next, solve L */
1527aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1528afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
15291b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1530afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1531afb2bd1cSJunchao Zhang                       #endif
1532afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1533aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1534aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1535aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1536aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1537afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
15381b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1539afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1540afb2bd1cSJunchao Zhang                       #endif
1541afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1542aa372e3fSPaul Mullowney 
1543aa372e3fSPaul Mullowney   /* Then, solve U */
1544aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1545afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
15461b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1547afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1548afb2bd1cSJunchao Zhang                       #endif
1549afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1550aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1551aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1552aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1553aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1554afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
15551b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1556afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1557afb2bd1cSJunchao Zhang                       #endif
1558afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1559aa372e3fSPaul Mullowney 
15604e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
15614e4bbfaaSStefano Zampini   thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
15624e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
15634e4bbfaaSStefano Zampini                xGPU);
15649ae82921SPaul Mullowney 
1565c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1566c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
156705035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1568661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1569958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
15709ae82921SPaul Mullowney   PetscFunctionReturn(0);
15719ae82921SPaul Mullowney }
15729ae82921SPaul Mullowney 
15736fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
15749ae82921SPaul Mullowney {
1575465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1576465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
15779ae82921SPaul Mullowney   cusparseStatus_t                  stat;
15789ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1579aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1580aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1581aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1582b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
158357d48284SJunchao Zhang   cudaError_t                       cerr;
15849ae82921SPaul Mullowney 
15859ae82921SPaul Mullowney   PetscFunctionBegin;
1586e057df02SPaul Mullowney   /* Get the GPU pointers */
1587c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1588c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
15899ae82921SPaul Mullowney 
15907a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1591aa372e3fSPaul Mullowney   /* First, solve L */
1592aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1593afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
15941b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1595afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1596afb2bd1cSJunchao Zhang                       #endif
1597afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1598aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1599aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1600aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1601aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1602afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
16031b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1604afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1605afb2bd1cSJunchao Zhang                       #endif
1606afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1607aa372e3fSPaul Mullowney 
1608aa372e3fSPaul Mullowney   /* Next, solve U */
1609aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1610afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16111b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1612afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1613afb2bd1cSJunchao Zhang                       #endif
1614afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1615aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1616aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1617aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1618aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1619afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
16201b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1621afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1622afb2bd1cSJunchao Zhang                       #endif
1623afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
16249ae82921SPaul Mullowney 
1625c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1626c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
162705035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1628661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1629958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
16309ae82921SPaul Mullowney   PetscFunctionReturn(0);
16319ae82921SPaul Mullowney }
16329ae82921SPaul Mullowney 
16337e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
16347e8381f9SStefano Zampini {
16357e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
16367e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
16377e8381f9SStefano Zampini   cudaError_t        cerr;
16387e8381f9SStefano Zampini   PetscErrorCode     ierr;
16397e8381f9SStefano Zampini 
16407e8381f9SStefano Zampini   PetscFunctionBegin;
16417e8381f9SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
16427e8381f9SStefano Zampini     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
16437e8381f9SStefano Zampini 
16447e8381f9SStefano Zampini     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
16457e8381f9SStefano Zampini     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
16467e8381f9SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
16477e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
16487e8381f9SStefano Zampini     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
16497e8381f9SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
16507e8381f9SStefano Zampini   }
16517e8381f9SStefano Zampini   PetscFunctionReturn(0);
16527e8381f9SStefano Zampini }
16537e8381f9SStefano Zampini 
16547e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
16557e8381f9SStefano Zampini {
16567e8381f9SStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
16577e8381f9SStefano Zampini   PetscErrorCode ierr;
16587e8381f9SStefano Zampini 
16597e8381f9SStefano Zampini   PetscFunctionBegin;
16607e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
16617e8381f9SStefano Zampini   *array = a->a;
16627e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
16637e8381f9SStefano Zampini   PetscFunctionReturn(0);
16647e8381f9SStefano Zampini }
16657e8381f9SStefano Zampini 
16666fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
16679ae82921SPaul Mullowney {
1668aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
16697c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
16709ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1671213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
16729ae82921SPaul Mullowney   PetscErrorCode               ierr;
1673aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1674b06137fdSPaul Mullowney   cudaError_t                  err;
16759ae82921SPaul Mullowney 
16769ae82921SPaul Mullowney   PetscFunctionBegin;
167795639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1678c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
167981902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
168081902715SJunchao Zhang       /* Copy values only */
1681afb2bd1cSJunchao Zhang       CsrMatrix *matrix,*matrixT;
1682afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
168385ba7357SStefano Zampini 
168485ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1685afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
168605035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
16874863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
168885ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
168981902715SJunchao Zhang 
169081902715SJunchao Zhang       /* Update matT when it was built before */
169181902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
169281902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1693afb2bd1cSJunchao Zhang         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
169485ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
169581902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1696afb2bd1cSJunchao Zhang                             A->cmap->n, matrix->num_entries,
1697afb2bd1cSJunchao Zhang                             matrix->values->data().get(),
169881902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1699afb2bd1cSJunchao Zhang                             matrix->column_indices->data().get(),
1700afb2bd1cSJunchao Zhang                             matrixT->values->data().get(),
1701afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1702afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1703afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1704afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1705afb2bd1cSJunchao Zhang                           #else
1706afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1707afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1708afb2bd1cSJunchao Zhang                           #endif
1709afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
171005035670SJunchao Zhang         err  = WaitForCUDA();CHKERRCUDA(err);
171185ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
171281902715SJunchao Zhang       }
171334d6c7a5SJose E. Roman     } else {
171485ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
17157c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
17167c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
17177c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
171881902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
17199ae82921SPaul Mullowney       try {
17209ae82921SPaul Mullowney         if (a->compressedrow.use) {
17219ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
17229ae82921SPaul Mullowney           ii   = a->compressedrow.i;
17239ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
17249ae82921SPaul Mullowney         } else {
1725213423ffSJunchao Zhang           m    = A->rmap->n;
1726213423ffSJunchao Zhang           ii   = a->i;
1727e6e9a74fSStefano Zampini           ridx = NULL;
17289ae82921SPaul Mullowney         }
1729213423ffSJunchao Zhang         cusparsestruct->nrows = m;
17309ae82921SPaul Mullowney 
173185ba7357SStefano Zampini         /* create cusparse matrix */
1732aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
173357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
173457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
173557d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
17369ae82921SPaul Mullowney 
1737afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
17387656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
17397656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1740afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17417656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17427656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
174357d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1744b06137fdSPaul Mullowney 
1745aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1746aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1747aa372e3fSPaul Mullowney           /* set the matrix */
1748afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1749afb2bd1cSJunchao Zhang           mat->num_rows = m;
1750afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1751afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1752afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1753afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
17549ae82921SPaul Mullowney 
1755afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1756afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1757aa372e3fSPaul Mullowney 
1758afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1759afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1760aa372e3fSPaul Mullowney 
1761aa372e3fSPaul Mullowney           /* assign the pointer */
1762afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1763afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1764afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1765afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1766afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1767afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1768afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1769afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1770afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1771afb2bd1cSJunchao Zhang           }
1772afb2bd1cSJunchao Zhang          #endif
1773aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1774afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1775afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1776afb2bd1cSJunchao Zhang          #else
1777afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1778afb2bd1cSJunchao Zhang           mat->num_rows = m;
1779afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1780afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1781afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1782afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1783aa372e3fSPaul Mullowney 
1784afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1785afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1786aa372e3fSPaul Mullowney 
1787afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1788afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1789aa372e3fSPaul Mullowney 
1790aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
179157d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1792aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1793aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1794afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1795afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1796afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1797afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
179857d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1799aa372e3fSPaul Mullowney           /* assign the pointer */
1800aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1801aa372e3fSPaul Mullowney 
1802afb2bd1cSJunchao Zhang           if (mat) {
1803afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1804afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1805afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1806afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1807087f3262SPaul Mullowney           }
1808afb2bd1cSJunchao Zhang          #endif
1809087f3262SPaul Mullowney         }
1810ca45077fSPaul Mullowney 
1811aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1812213423ffSJunchao Zhang         if (a->compressedrow.use) {
1813213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1814aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1815aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1816213423ffSJunchao Zhang           tmp = m;
1817213423ffSJunchao Zhang         } else {
1818213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1819213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1820213423ffSJunchao Zhang           tmp = 0;
1821213423ffSJunchao Zhang         }
1822213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1823aa372e3fSPaul Mullowney 
1824aa372e3fSPaul Mullowney         /* assign the pointer */
1825aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
18269ae82921SPaul Mullowney       } catch(char *ex) {
18279ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18289ae82921SPaul Mullowney       }
182905035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
183085ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
183134d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
183234d6c7a5SJose E. Roman     }
1833c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
18349ae82921SPaul Mullowney   }
18359ae82921SPaul Mullowney   PetscFunctionReturn(0);
18369ae82921SPaul Mullowney }
18379ae82921SPaul Mullowney 
1838c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1839aa372e3fSPaul Mullowney {
1840aa372e3fSPaul Mullowney   template <typename Tuple>
1841aa372e3fSPaul Mullowney   __host__ __device__
1842aa372e3fSPaul Mullowney   void operator()(Tuple t)
1843aa372e3fSPaul Mullowney   {
1844aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1845aa372e3fSPaul Mullowney   }
1846aa372e3fSPaul Mullowney };
1847aa372e3fSPaul Mullowney 
18487e8381f9SStefano Zampini struct VecCUDAEquals
18497e8381f9SStefano Zampini {
18507e8381f9SStefano Zampini   template <typename Tuple>
18517e8381f9SStefano Zampini   __host__ __device__
18527e8381f9SStefano Zampini   void operator()(Tuple t)
18537e8381f9SStefano Zampini   {
18547e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
18557e8381f9SStefano Zampini   }
18567e8381f9SStefano Zampini };
18577e8381f9SStefano Zampini 
1858e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
1859e6e9a74fSStefano Zampini {
1860e6e9a74fSStefano Zampini   template <typename Tuple>
1861e6e9a74fSStefano Zampini   __host__ __device__
1862e6e9a74fSStefano Zampini   void operator()(Tuple t)
1863e6e9a74fSStefano Zampini   {
1864e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
1865e6e9a74fSStefano Zampini   }
1866e6e9a74fSStefano Zampini };
1867e6e9a74fSStefano Zampini 
1868afb2bd1cSJunchao Zhang struct MatMatCusparse {
1869ccdfe979SStefano Zampini   PetscBool            cisdense;
1870ccdfe979SStefano Zampini   PetscScalar          *Bt;
1871ccdfe979SStefano Zampini   Mat                  X;
1872afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1873afb2bd1cSJunchao Zhang   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1874afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
1875afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
1876afb2bd1cSJunchao Zhang   size_t               spmmBufferSize;
1877afb2bd1cSJunchao Zhang   void                 *spmmBuffer;
1878afb2bd1cSJunchao Zhang   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1879afb2bd1cSJunchao Zhang #endif
1880afb2bd1cSJunchao Zhang };
1881ccdfe979SStefano Zampini 
1882ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
1883ccdfe979SStefano Zampini {
1884ccdfe979SStefano Zampini   PetscErrorCode ierr;
1885ccdfe979SStefano Zampini   MatMatCusparse *mmdata = (MatMatCusparse *)data;
1886ccdfe979SStefano Zampini   cudaError_t    cerr;
1887ccdfe979SStefano Zampini 
1888ccdfe979SStefano Zampini   PetscFunctionBegin;
1889ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
1890afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1891afb2bd1cSJunchao Zhang   cusparseStatus_t stat;
1892afb2bd1cSJunchao Zhang   if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
1893afb2bd1cSJunchao Zhang   if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
1894afb2bd1cSJunchao Zhang   if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
1895afb2bd1cSJunchao Zhang  #endif
1896ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
1897ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
1898ccdfe979SStefano Zampini   PetscFunctionReturn(0);
1899ccdfe979SStefano Zampini }
1900ccdfe979SStefano Zampini 
1901ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1902ccdfe979SStefano Zampini 
1903ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1904ccdfe979SStefano Zampini {
1905ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1906ccdfe979SStefano Zampini   Mat                          A,B;
1907afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1908ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1909ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1910ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1911ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1912ccdfe979SStefano Zampini   const PetscScalar            *barray;
1913ccdfe979SStefano Zampini   PetscScalar                  *carray;
1914ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1915ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1916ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1917ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1918afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1919ccdfe979SStefano Zampini 
1920ccdfe979SStefano Zampini   PetscFunctionBegin;
1921ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1922ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1923ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1924ccdfe979SStefano Zampini   A    = product->A;
1925ccdfe979SStefano Zampini   B    = product->B;
1926ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1927ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1928ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1929ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1930ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1931ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1932ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1933ccdfe979SStefano Zampini   switch (product->type) {
1934ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1935ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1936ccdfe979SStefano Zampini     mat = cusp->mat;
1937ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1938ccdfe979SStefano Zampini     m   = A->rmap->n;
1939ccdfe979SStefano Zampini     n   = B->cmap->n;
1940ccdfe979SStefano Zampini     break;
1941ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1942e6e9a74fSStefano Zampini     if (!cusp->transgen) {
1943e6e9a74fSStefano Zampini       mat = cusp->mat;
1944e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1945e6e9a74fSStefano Zampini     } else {
1946ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
1947ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1948ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1949e6e9a74fSStefano Zampini     }
1950ccdfe979SStefano Zampini     m = A->cmap->n;
1951ccdfe979SStefano Zampini     n = B->cmap->n;
1952ccdfe979SStefano Zampini     break;
1953ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
1954ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
1955ccdfe979SStefano Zampini     mat = cusp->mat;
1956ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1957ccdfe979SStefano Zampini     m   = A->rmap->n;
1958ccdfe979SStefano Zampini     n   = B->rmap->n;
1959ccdfe979SStefano Zampini     break;
1960ccdfe979SStefano Zampini   default:
1961ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
1962ccdfe979SStefano Zampini   }
1963ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
1964ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
1965ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
1966ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
1967afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
1968ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
1969afb2bd1cSJunchao Zhang 
1970ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
1971c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
1972c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
1973c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
1974c8378d12SStefano Zampini   } else {
1975c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
1976c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
1977c8378d12SStefano Zampini   }
1978c8378d12SStefano Zampini 
1979c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1980afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1981afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
1982afb2bd1cSJunchao Zhang   /* (re)allcoate spmmBuffer if not initialized or LDAs are different */
1983afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
1984afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
1985afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
1986afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1987afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
1988afb2bd1cSJunchao Zhang     }
1989c8378d12SStefano Zampini 
1990afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
1991afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
1992afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
1993afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
1994afb2bd1cSJunchao Zhang     }
1995afb2bd1cSJunchao Zhang 
1996afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
1997afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
1998afb2bd1cSJunchao Zhang                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
1999afb2bd1cSJunchao Zhang                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2000afb2bd1cSJunchao Zhang                               csrmat->values->data().get(),
2001afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2002afb2bd1cSJunchao Zhang                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2003afb2bd1cSJunchao Zhang     }
2004afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2005afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2006afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2007afb2bd1cSJunchao Zhang                                    cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
2008afb2bd1cSJunchao Zhang     if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
2009afb2bd1cSJunchao Zhang     cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
2010afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2011afb2bd1cSJunchao Zhang   } else {
2012afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2013afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2014afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2015afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2016afb2bd1cSJunchao Zhang   }
2017afb2bd1cSJunchao Zhang 
2018afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2019afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2020afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2021afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2022afb2bd1cSJunchao Zhang                       cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
2023afb2bd1cSJunchao Zhang  #else
2024afb2bd1cSJunchao Zhang   PetscInt k;
2025afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2026ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2027ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2028ccdfe979SStefano Zampini     cublasStatus_t cerr;
2029ccdfe979SStefano Zampini 
2030ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2031ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2032ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2033ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2034ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2035ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2036ccdfe979SStefano Zampini     blda = B->cmap->n;
2037afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2038afb2bd1cSJunchao Zhang   } else {
2039afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2040ccdfe979SStefano Zampini   }
2041ccdfe979SStefano Zampini 
2042afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2043ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2044afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2045ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2046ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2047ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2048ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2049ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2050afb2bd1cSJunchao Zhang  #endif
2051afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2052c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2053c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2054ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2055ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2056ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2057ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2058ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2059ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2060ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2061ccdfe979SStefano Zampini   } else {
2062ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2063ccdfe979SStefano Zampini   }
2064ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2065ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2066ccdfe979SStefano Zampini   }
2067ccdfe979SStefano Zampini   if (!biscuda) {
2068ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2069ccdfe979SStefano Zampini   }
2070ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2071ccdfe979SStefano Zampini }
2072ccdfe979SStefano Zampini 
2073ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2074ccdfe979SStefano Zampini {
2075ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2076ccdfe979SStefano Zampini   Mat                A,B;
2077ccdfe979SStefano Zampini   PetscInt           m,n;
2078ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2079ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2080ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2081ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2082ccdfe979SStefano Zampini 
2083ccdfe979SStefano Zampini   PetscFunctionBegin;
2084ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2085ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2086ccdfe979SStefano Zampini   A    = product->A;
2087ccdfe979SStefano Zampini   B    = product->B;
2088ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2089ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2090ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2091ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2092ccdfe979SStefano Zampini   switch (product->type) {
2093ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2094ccdfe979SStefano Zampini     m = A->rmap->n;
2095ccdfe979SStefano Zampini     n = B->cmap->n;
2096ccdfe979SStefano Zampini     break;
2097ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2098ccdfe979SStefano Zampini     m = A->cmap->n;
2099ccdfe979SStefano Zampini     n = B->cmap->n;
2100ccdfe979SStefano Zampini     break;
2101ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2102ccdfe979SStefano Zampini     m = A->rmap->n;
2103ccdfe979SStefano Zampini     n = B->rmap->n;
2104ccdfe979SStefano Zampini     break;
2105ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2106ccdfe979SStefano Zampini     m = B->cmap->n;
2107ccdfe979SStefano Zampini     n = B->cmap->n;
2108ccdfe979SStefano Zampini     break;
2109ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2110ccdfe979SStefano Zampini     m = B->rmap->n;
2111ccdfe979SStefano Zampini     n = B->rmap->n;
2112ccdfe979SStefano Zampini     break;
2113ccdfe979SStefano Zampini   default:
2114ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2115ccdfe979SStefano Zampini   }
2116ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2117ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2118ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2119ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2120ccdfe979SStefano Zampini 
2121ccdfe979SStefano Zampini   /* product data */
2122ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2123ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2124afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2125afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2126ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2127afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2128ccdfe979SStefano Zampini   }
2129afb2bd1cSJunchao Zhang  #endif
2130ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2131ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2132ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2133ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2134ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2135ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2136ccdfe979SStefano Zampini     } else {
2137ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2138ccdfe979SStefano Zampini     }
2139ccdfe979SStefano Zampini   }
2140ccdfe979SStefano Zampini   C->product->data    = mmdata;
2141ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2142ccdfe979SStefano Zampini 
2143ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2144ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2145ccdfe979SStefano Zampini }
2146ccdfe979SStefano Zampini 
2147ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2148ccdfe979SStefano Zampini 
2149ccdfe979SStefano Zampini /* handles dense B */
2150ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2151ccdfe979SStefano Zampini {
2152ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
2153ccdfe979SStefano Zampini   PetscErrorCode ierr;
2154ccdfe979SStefano Zampini 
2155ccdfe979SStefano Zampini   PetscFunctionBegin;
2156ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2157ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2158ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
2159ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2160ccdfe979SStefano Zampini     PetscFunctionReturn(0);
2161ccdfe979SStefano Zampini   }
2162ccdfe979SStefano Zampini   switch (product->type) {
2163ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2164ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2165ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2166ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2167ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2168ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2169ccdfe979SStefano Zampini   default:
2170ccdfe979SStefano Zampini     break;
2171ccdfe979SStefano Zampini   }
2172ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2173ccdfe979SStefano Zampini }
2174ccdfe979SStefano Zampini 
21756fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
21769ae82921SPaul Mullowney {
2177b175d8bbSPaul Mullowney   PetscErrorCode ierr;
21789ae82921SPaul Mullowney 
21799ae82921SPaul Mullowney   PetscFunctionBegin;
2180e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2181e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2182e6e9a74fSStefano Zampini }
2183e6e9a74fSStefano Zampini 
2184e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2185e6e9a74fSStefano Zampini {
2186e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2187e6e9a74fSStefano Zampini 
2188e6e9a74fSStefano Zampini   PetscFunctionBegin;
2189e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2190e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2191e6e9a74fSStefano Zampini }
2192e6e9a74fSStefano Zampini 
2193e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2194e6e9a74fSStefano Zampini {
2195e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2196e6e9a74fSStefano Zampini 
2197e6e9a74fSStefano Zampini   PetscFunctionBegin;
2198e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2199e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2200e6e9a74fSStefano Zampini }
2201e6e9a74fSStefano Zampini 
2202e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2203e6e9a74fSStefano Zampini {
2204e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2205e6e9a74fSStefano Zampini 
2206e6e9a74fSStefano Zampini   PetscFunctionBegin;
2207e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
22089ae82921SPaul Mullowney   PetscFunctionReturn(0);
22099ae82921SPaul Mullowney }
22109ae82921SPaul Mullowney 
22116fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2212ca45077fSPaul Mullowney {
2213b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2214ca45077fSPaul Mullowney 
2215ca45077fSPaul Mullowney   PetscFunctionBegin;
2216e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2217ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2218ca45077fSPaul Mullowney }
2219ca45077fSPaul Mullowney 
2220afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2221e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
22229ae82921SPaul Mullowney {
22239ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2224aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
22259ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2226e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2227b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
222857d48284SJunchao Zhang   cudaError_t                  cerr;
2229aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2230e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2231e6e9a74fSStefano Zampini   PetscBool                    compressed;
2232afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2233afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2234afb2bd1cSJunchao Zhang #endif
22356e111a19SKarl Rupp 
22369ae82921SPaul Mullowney   PetscFunctionBegin;
2237e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2238e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2239afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2240d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2241e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2242e6e9a74fSStefano Zampini   }
224334d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
224434d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2245e6e9a74fSStefano Zampini   if (!trans) {
22469ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2247c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2248e6e9a74fSStefano Zampini   } else {
2249e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2250e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2251e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2252e6e9a74fSStefano Zampini     } else {
2253afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2254e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2255e6e9a74fSStefano Zampini     }
2256e6e9a74fSStefano Zampini   }
2257e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2258e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2259213423ffSJunchao Zhang 
2260e6e9a74fSStefano Zampini   try {
2261e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2262213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2263213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2264afb2bd1cSJunchao Zhang 
226585ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2266e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2267afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2268afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2269afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2270afb2bd1cSJunchao Zhang       */
2271e6e9a74fSStefano Zampini       xptr = xarray;
2272afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2273213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2274afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2275afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2276afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2277afb2bd1cSJunchao Zhang        */
2278afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2279afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2280afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2281afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2282afb2bd1cSJunchao Zhang       }
2283afb2bd1cSJunchao Zhang      #endif
2284e6e9a74fSStefano Zampini     } else {
2285afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2286afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2287afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2288afb2bd1cSJunchao Zhang        */
2289afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2290e6e9a74fSStefano Zampini       dptr = zarray;
2291e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2292afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2293e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2294e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2295e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2296e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2297e6e9a74fSStefano Zampini       }
2298afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2299afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2300afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2301afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2302afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2303afb2bd1cSJunchao Zhang       }
2304afb2bd1cSJunchao Zhang      #endif
2305e6e9a74fSStefano Zampini     }
23069ae82921SPaul Mullowney 
2307afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2308aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2309afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2310afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2311afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2312afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2313afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2314afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2315afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2316afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2317afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2318afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2319afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2320afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2321afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2322afb2bd1cSJunchao Zhang 
2323afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2324afb2bd1cSJunchao Zhang       } else {
2325afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2326afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2327afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2328afb2bd1cSJunchao Zhang       }
2329afb2bd1cSJunchao Zhang 
2330afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2331afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2332afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2333afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2334afb2bd1cSJunchao Zhang                                beta,
2335afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2336afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2337afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2338afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2339afb2bd1cSJunchao Zhang      #else
23407656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2341e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2342a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2343afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2344aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2345e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
234657d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2347afb2bd1cSJunchao Zhang      #endif
2348aa372e3fSPaul Mullowney     } else {
2349213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2350afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2351afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2352afb2bd1cSJunchao Zhang        #else
2353301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2354e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2355afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2356e6e9a74fSStefano Zampini                                  xptr, beta,
235757d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2358afb2bd1cSJunchao Zhang        #endif
2359a65300a6SPaul Mullowney       }
2360aa372e3fSPaul Mullowney     }
236105035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2362958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2363aa372e3fSPaul Mullowney 
2364e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2365213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2366213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2367213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2368e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2369213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
23707656d835SStefano Zampini         }
2371213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2372c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
23737656d835SStefano Zampini       }
23747656d835SStefano Zampini 
2375213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2376213423ffSJunchao Zhang       if (compressed) {
2377213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2378e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2379c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2380e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2381c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
238205035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2383958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2384e6e9a74fSStefano Zampini       }
2385e6e9a74fSStefano Zampini     } else {
2386e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2387e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2388e6e9a74fSStefano Zampini       }
2389e6e9a74fSStefano Zampini     }
2390e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2391213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2392213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
23939ae82921SPaul Mullowney   } catch(char *ex) {
23949ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
23959ae82921SPaul Mullowney   }
2396e6e9a74fSStefano Zampini   if (yy) {
2397958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2398e6e9a74fSStefano Zampini   } else {
2399e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2400e6e9a74fSStefano Zampini   }
24019ae82921SPaul Mullowney   PetscFunctionReturn(0);
24029ae82921SPaul Mullowney }
24039ae82921SPaul Mullowney 
24046fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2405ca45077fSPaul Mullowney {
2406b175d8bbSPaul Mullowney   PetscErrorCode ierr;
24076e111a19SKarl Rupp 
2408ca45077fSPaul Mullowney   PetscFunctionBegin;
2409e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2410ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2411ca45077fSPaul Mullowney }
2412ca45077fSPaul Mullowney 
24136fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
24149ae82921SPaul Mullowney {
24159ae82921SPaul Mullowney   PetscErrorCode              ierr;
24163fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
24173fa6b06aSMark Adams   PetscBool                   is_seq = PETSC_TRUE;
24183fa6b06aSMark Adams   PetscInt                    nnz_state = A->nonzerostate;
24199ae82921SPaul Mullowney   PetscFunctionBegin;
2420bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
24213fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2422bc3f50f2SPaul Mullowney   }
24233fa6b06aSMark Adams   if (d_mat) {
24243fa6b06aSMark Adams     cudaError_t err;
24253fa6b06aSMark Adams     ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
24263fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
24273fa6b06aSMark Adams     nnz_state = h_mat.nonzerostate;
24283fa6b06aSMark Adams     is_seq = h_mat.seq;
24293fa6b06aSMark Adams   }
24303fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
24313fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
24323fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU
24333fa6b06aSMark Adams     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
24343fa6b06aSMark Adams   } else if (nnz_state > A->nonzerostate) {
24353fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
24363fa6b06aSMark Adams   }
24373fa6b06aSMark Adams 
24389ae82921SPaul Mullowney   PetscFunctionReturn(0);
24399ae82921SPaul Mullowney }
24409ae82921SPaul Mullowney 
24419ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2442e057df02SPaul Mullowney /*@
24439ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2444e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
2445e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2446e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2447e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2448e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
24499ae82921SPaul Mullowney 
2450d083f849SBarry Smith    Collective
24519ae82921SPaul Mullowney 
24529ae82921SPaul Mullowney    Input Parameters:
24539ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
24549ae82921SPaul Mullowney .  m - number of rows
24559ae82921SPaul Mullowney .  n - number of columns
24569ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
24579ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
24580298fd71SBarry Smith          (possibly different for each row) or NULL
24599ae82921SPaul Mullowney 
24609ae82921SPaul Mullowney    Output Parameter:
24619ae82921SPaul Mullowney .  A - the matrix
24629ae82921SPaul Mullowney 
24639ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
24649ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
24659ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
24669ae82921SPaul Mullowney 
24679ae82921SPaul Mullowney    Notes:
24689ae82921SPaul Mullowney    If nnz is given then nz is ignored
24699ae82921SPaul Mullowney 
24709ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
24719ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
24729ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
24739ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
24749ae82921SPaul Mullowney 
24759ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
24760298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
24779ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
24789ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
24799ae82921SPaul Mullowney 
24809ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
24819ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
24829ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
24839ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
24849ae82921SPaul Mullowney 
24859ae82921SPaul Mullowney    Level: intermediate
24869ae82921SPaul Mullowney 
2487e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
24889ae82921SPaul Mullowney @*/
24899ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
24909ae82921SPaul Mullowney {
24919ae82921SPaul Mullowney   PetscErrorCode ierr;
24929ae82921SPaul Mullowney 
24939ae82921SPaul Mullowney   PetscFunctionBegin;
24949ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
24959ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
24969ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
24979ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
24989ae82921SPaul Mullowney   PetscFunctionReturn(0);
24999ae82921SPaul Mullowney }
25009ae82921SPaul Mullowney 
25016fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
25029ae82921SPaul Mullowney {
25039ae82921SPaul Mullowney   PetscErrorCode              ierr;
25043fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL;
2505ab25e6cbSDominic Meiser 
25069ae82921SPaul Mullowney   PetscFunctionBegin;
25079ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
25083fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
25093fa6b06aSMark Adams     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
2510470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
25119ae82921SPaul Mullowney   } else {
2512470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
2513aa372e3fSPaul Mullowney   }
25143fa6b06aSMark Adams   if (d_mat) {
25153fa6b06aSMark Adams     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
25163fa6b06aSMark Adams     cudaError_t                err;
25173fa6b06aSMark Adams     PetscSplitCSRDataStructure h_mat;
25183fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
25193fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
25203fa6b06aSMark Adams     if (h_mat.seq) {
25213fa6b06aSMark Adams       if (a->compressedrow.use) {
25223fa6b06aSMark Adams  	err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
25233fa6b06aSMark Adams       }
25243fa6b06aSMark Adams       err = cudaFree(d_mat);CHKERRCUDA(err);
25253fa6b06aSMark Adams     }
25263fa6b06aSMark Adams   }
2527ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
2528ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
2529ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
2530ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
25317e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
25327e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
25339ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
25349ae82921SPaul Mullowney   PetscFunctionReturn(0);
25359ae82921SPaul Mullowney }
25369ae82921SPaul Mullowney 
2537ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
253895639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
25399ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
25409ff858a8SKarl Rupp {
25419ff858a8SKarl Rupp   PetscErrorCode ierr;
25429ff858a8SKarl Rupp 
25439ff858a8SKarl Rupp   PetscFunctionBegin;
25449ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
2545ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
25469ff858a8SKarl Rupp   PetscFunctionReturn(0);
25479ff858a8SKarl Rupp }
25489ff858a8SKarl Rupp 
254995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
255095639643SRichard Tran Mills {
2551c58ef05eSStefano Zampini   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
2552e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2553e6e9a74fSStefano Zampini 
255495639643SRichard Tran Mills   PetscFunctionBegin;
2555e6e9a74fSStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
255695639643SRichard Tran Mills   if (flg) {
25577e8381f9SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
25587e8381f9SStefano Zampini 
255995639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJ;
256095639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJ;
2561c34f1ff0SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
2562c34f1ff0SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
2563e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = NULL;
2564e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = NULL;
2565e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
2566e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
25677e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
25687e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
25697e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
257095639643SRichard Tran Mills   } else {
257195639643SRichard Tran Mills     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
257295639643SRichard Tran Mills     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
257395639643SRichard Tran Mills     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
257495639643SRichard Tran Mills     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
2575e6e9a74fSStefano Zampini     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
2576e6e9a74fSStefano Zampini     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
2577e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
2578e6e9a74fSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
25797e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
25807e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
25817e8381f9SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
258295639643SRichard Tran Mills   }
258395639643SRichard Tran Mills   A->boundtocpu = flg;
2584c58ef05eSStefano Zampini   a->inode.use = flg;
258595639643SRichard Tran Mills   PetscFunctionReturn(0);
258695639643SRichard Tran Mills }
258795639643SRichard Tran Mills 
25883fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
25893fa6b06aSMark Adams {
25903fa6b06aSMark Adams   PetscSplitCSRDataStructure *d_mat = NULL;
25913fa6b06aSMark Adams   PetscErrorCode             ierr;
25927e8381f9SStefano Zampini   PetscBool                  both = PETSC_FALSE;
25937e8381f9SStefano Zampini 
25943fa6b06aSMark Adams   PetscFunctionBegin;
25953fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
25963fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
25977e8381f9SStefano Zampini     if (spptr->mat) {
25987e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
25997e8381f9SStefano Zampini       if (matrix->values) {
26007e8381f9SStefano Zampini         both = PETSC_TRUE;
26017e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
26027e8381f9SStefano Zampini       }
26037e8381f9SStefano Zampini     }
26047e8381f9SStefano Zampini     if (spptr->matTranspose) {
26057e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
26067e8381f9SStefano Zampini       if (matrix->values) {
26077e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
26087e8381f9SStefano Zampini       }
26097e8381f9SStefano Zampini     }
26103fa6b06aSMark Adams     d_mat = spptr->deviceMat;
26113fa6b06aSMark Adams   }
26123fa6b06aSMark Adams   if (d_mat) {
26133fa6b06aSMark Adams     Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
26143fa6b06aSMark Adams     PetscInt     n = A->rmap->n, nnz = a->i[n];
26153fa6b06aSMark Adams     cudaError_t  err;
26163fa6b06aSMark Adams     PetscScalar  *vals;
26173fa6b06aSMark Adams     ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
26183fa6b06aSMark Adams     err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
26193fa6b06aSMark Adams     err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
26203fa6b06aSMark Adams   }
26213fa6b06aSMark Adams   ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
26227e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
26233fa6b06aSMark Adams 
26243fa6b06aSMark Adams   PetscFunctionReturn(0);
26253fa6b06aSMark Adams }
26263fa6b06aSMark Adams 
262749735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
26289ae82921SPaul Mullowney {
26299ae82921SPaul Mullowney   PetscErrorCode   ierr;
2630aa372e3fSPaul Mullowney   cusparseStatus_t stat;
263149735bf3SStefano Zampini   Mat              B;
26329ae82921SPaul Mullowney 
26339ae82921SPaul Mullowney   PetscFunctionBegin;
263449735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
263549735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
263649735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
263749735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
263849735bf3SStefano Zampini   }
263949735bf3SStefano Zampini   B = *newmat;
264049735bf3SStefano Zampini 
264134136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
264234136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
264334136279SStefano Zampini 
264449735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
26459ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
2646e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
2647e6e9a74fSStefano Zampini 
2648e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2649e6e9a74fSStefano Zampini       spptr->format = MAT_CUSPARSE_CSR;
2650e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2651e6e9a74fSStefano Zampini       B->spptr = spptr;
26523fa6b06aSMark Adams       spptr->deviceMat = NULL;
26539ae82921SPaul Mullowney     } else {
2654e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
2655e6e9a74fSStefano Zampini 
2656e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
2657e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
2658e6e9a74fSStefano Zampini       B->spptr = spptr;
26599ae82921SPaul Mullowney     }
2660e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
266149735bf3SStefano Zampini   }
2662693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
26639ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
26649ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
266595639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
2666693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
26673fa6b06aSMark Adams   B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;
26682205254eSKarl Rupp 
2669e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
26709ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2671bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
26729ae82921SPaul Mullowney   PetscFunctionReturn(0);
26739ae82921SPaul Mullowney }
26749ae82921SPaul Mullowney 
267502fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
267602fe1965SBarry Smith {
267702fe1965SBarry Smith   PetscErrorCode ierr;
267802fe1965SBarry Smith 
267902fe1965SBarry Smith   PetscFunctionBegin;
268005035670SJunchao Zhang   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
268102fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
26820ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2683afb2bd1cSJunchao Zhang   ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
2684afb2bd1cSJunchao Zhang   ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
2685afb2bd1cSJunchao Zhang   ierr = PetscOptionsEnd();CHKERRQ(ierr);
268602fe1965SBarry Smith   PetscFunctionReturn(0);
268702fe1965SBarry Smith }
268802fe1965SBarry Smith 
26893ca39a21SBarry Smith /*MC
2690e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
2691e057df02SPaul Mullowney 
2692e057df02SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
26932692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
26942692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
2695e057df02SPaul Mullowney 
2696e057df02SPaul Mullowney    Options Database Keys:
2697e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
2698aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2699a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
2700e057df02SPaul Mullowney 
2701e057df02SPaul Mullowney   Level: beginner
2702e057df02SPaul Mullowney 
27038468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
2704e057df02SPaul Mullowney M*/
27057f756511SDominic Meiser 
270642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
270742c9c57cSBarry Smith 
27080f39cd5aSBarry Smith 
27093ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
271042c9c57cSBarry Smith {
271142c9c57cSBarry Smith   PetscErrorCode ierr;
271242c9c57cSBarry Smith 
271342c9c57cSBarry Smith   PetscFunctionBegin;
27143ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
27153ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
27163ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
27173ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
271842c9c57cSBarry Smith   PetscFunctionReturn(0);
271942c9c57cSBarry Smith }
272029b38603SBarry Smith 
2721470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
27227f756511SDominic Meiser {
2723e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
27247f756511SDominic Meiser   cusparseStatus_t stat;
27257f756511SDominic Meiser 
27267f756511SDominic Meiser   PetscFunctionBegin;
27277f756511SDominic Meiser   if (*cusparsestruct) {
2728e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
2729e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
27307f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
273181902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
27327e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
27337e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
27347e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_v;
27357e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_w;
27367e8381f9SStefano Zampini     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
2737afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2738afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr);
2739afb2bd1cSJunchao Zhang    #endif
2740e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
27417f756511SDominic Meiser   }
27427f756511SDominic Meiser   PetscFunctionReturn(0);
27437f756511SDominic Meiser }
27447f756511SDominic Meiser 
27457f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
27467f756511SDominic Meiser {
27477f756511SDominic Meiser   PetscFunctionBegin;
27487f756511SDominic Meiser   if (*mat) {
27497f756511SDominic Meiser     delete (*mat)->values;
27507f756511SDominic Meiser     delete (*mat)->column_indices;
27517f756511SDominic Meiser     delete (*mat)->row_offsets;
27527f756511SDominic Meiser     delete *mat;
27537f756511SDominic Meiser     *mat = 0;
27547f756511SDominic Meiser   }
27557f756511SDominic Meiser   PetscFunctionReturn(0);
27567f756511SDominic Meiser }
27577f756511SDominic Meiser 
2758470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
27597f756511SDominic Meiser {
27607f756511SDominic Meiser   cusparseStatus_t stat;
27617f756511SDominic Meiser   PetscErrorCode   ierr;
27627f756511SDominic Meiser 
27637f756511SDominic Meiser   PetscFunctionBegin;
27647f756511SDominic Meiser   if (*trifactor) {
276557d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
2766afb2bd1cSJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
27677f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
27681b0a6780SStefano Zampini     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
2769afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
27701b0a6780SStefano Zampini     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
2771afb2bd1cSJunchao Zhang    #endif
2772*da79fbbcSStefano Zampini     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
27737f756511SDominic Meiser   }
27747f756511SDominic Meiser   PetscFunctionReturn(0);
27757f756511SDominic Meiser }
27767f756511SDominic Meiser 
2777470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
27787f756511SDominic Meiser {
27797f756511SDominic Meiser   CsrMatrix        *mat;
27807f756511SDominic Meiser   cusparseStatus_t stat;
27817f756511SDominic Meiser   cudaError_t      err;
27827f756511SDominic Meiser 
27837f756511SDominic Meiser   PetscFunctionBegin;
27847f756511SDominic Meiser   if (*matstruct) {
27857f756511SDominic Meiser     if ((*matstruct)->mat) {
27867f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
2787afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2788afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2789afb2bd1cSJunchao Zhang        #else
27907f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
279157d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
2792afb2bd1cSJunchao Zhang        #endif
27937f756511SDominic Meiser       } else {
27947f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
27957f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
27967f756511SDominic Meiser       }
27977f756511SDominic Meiser     }
279857d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
27997f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
2800afb2bd1cSJunchao Zhang     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
28017656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
28027656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
2803afb2bd1cSJunchao Zhang 
2804afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2805afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
2806afb2bd1cSJunchao Zhang     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
2807afb2bd1cSJunchao Zhang     for (int i=0; i<3; i++) {
2808afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
2809afb2bd1cSJunchao Zhang         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
2810afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
2811afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
2812afb2bd1cSJunchao Zhang       }
2813afb2bd1cSJunchao Zhang     }
2814afb2bd1cSJunchao Zhang    #endif
28157f756511SDominic Meiser     delete *matstruct;
28167e8381f9SStefano Zampini     *matstruct = NULL;
28177f756511SDominic Meiser   }
28187f756511SDominic Meiser   PetscFunctionReturn(0);
28197f756511SDominic Meiser }
28207f756511SDominic Meiser 
2821ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
28227f756511SDominic Meiser {
2823e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2824e6e9a74fSStefano Zampini 
28257f756511SDominic Meiser   PetscFunctionBegin;
28267f756511SDominic Meiser   if (*trifactors) {
2827e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
2828e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
2829e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
2830e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
28317f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
28327f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
28337f756511SDominic Meiser     delete (*trifactors)->workVector;
28347e8381f9SStefano Zampini     (*trifactors)->rpermIndices = NULL;
28357e8381f9SStefano Zampini     (*trifactors)->cpermIndices = NULL;
28367e8381f9SStefano Zampini     (*trifactors)->workVector = NULL;
2837ccdfe979SStefano Zampini   }
2838ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2839ccdfe979SStefano Zampini }
2840ccdfe979SStefano Zampini 
2841ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
2842ccdfe979SStefano Zampini {
2843e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
2844ccdfe979SStefano Zampini   cusparseHandle_t handle;
2845ccdfe979SStefano Zampini   cusparseStatus_t stat;
2846ccdfe979SStefano Zampini 
2847ccdfe979SStefano Zampini   PetscFunctionBegin;
2848ccdfe979SStefano Zampini   if (*trifactors) {
2849e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
28507f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
285157d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
28527f756511SDominic Meiser     }
2853e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
28547f756511SDominic Meiser   }
28557f756511SDominic Meiser   PetscFunctionReturn(0);
28567f756511SDominic Meiser }
28577e8381f9SStefano Zampini 
28587e8381f9SStefano Zampini struct IJCompare
28597e8381f9SStefano Zampini {
28607e8381f9SStefano Zampini   __host__ __device__
28617e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
28627e8381f9SStefano Zampini   {
28637e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
28647e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
28657e8381f9SStefano Zampini     return false;
28667e8381f9SStefano Zampini   }
28677e8381f9SStefano Zampini };
28687e8381f9SStefano Zampini 
28697e8381f9SStefano Zampini struct IJEqual
28707e8381f9SStefano Zampini {
28717e8381f9SStefano Zampini   __host__ __device__
28727e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
28737e8381f9SStefano Zampini   {
28747e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
28757e8381f9SStefano Zampini     return true;
28767e8381f9SStefano Zampini   }
28777e8381f9SStefano Zampini };
28787e8381f9SStefano Zampini 
28797e8381f9SStefano Zampini struct IJDiff
28807e8381f9SStefano Zampini {
28817e8381f9SStefano Zampini   __host__ __device__
28827e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
28837e8381f9SStefano Zampini   {
28847e8381f9SStefano Zampini     return t1 == t2 ? 0 : 1;
28857e8381f9SStefano Zampini   }
28867e8381f9SStefano Zampini };
28877e8381f9SStefano Zampini 
28887e8381f9SStefano Zampini struct IJSum
28897e8381f9SStefano Zampini {
28907e8381f9SStefano Zampini   __host__ __device__
28917e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
28927e8381f9SStefano Zampini   {
28937e8381f9SStefano Zampini     return t1||t2;
28947e8381f9SStefano Zampini   }
28957e8381f9SStefano Zampini };
28967e8381f9SStefano Zampini 
28977e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
28987e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
28997e8381f9SStefano Zampini {
29007e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
29017e8381f9SStefano Zampini   CsrMatrix          *matrix;
29027e8381f9SStefano Zampini   PetscErrorCode     ierr;
29037e8381f9SStefano Zampini   cudaError_t        cerr;
29047e8381f9SStefano Zampini   PetscInt           n;
29057e8381f9SStefano Zampini 
29067e8381f9SStefano Zampini   PetscFunctionBegin;
29077e8381f9SStefano Zampini   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
29087e8381f9SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
29097e8381f9SStefano Zampini   if (!cusp->cooPerm) {
29107e8381f9SStefano Zampini     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
29117e8381f9SStefano Zampini     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
29127e8381f9SStefano Zampini     PetscFunctionReturn(0);
29137e8381f9SStefano Zampini   }
29147e8381f9SStefano Zampini   n = cusp->cooPerm->size();
29157e8381f9SStefano Zampini   matrix = (CsrMatrix*)cusp->mat->mat;
29167e8381f9SStefano Zampini   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
29177e8381f9SStefano Zampini   if (!cusp->cooPerm_v) { cusp->cooPerm_v = new THRUSTARRAY(n); }
29187e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
29197e8381f9SStefano Zampini   if (v) {
29207e8381f9SStefano Zampini     cusp->cooPerm_v->assign(v,v+n);
29217e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
29227e8381f9SStefano Zampini   }
29237e8381f9SStefano Zampini   else thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
29247e8381f9SStefano Zampini   if (imode == ADD_VALUES) {
29257e8381f9SStefano Zampini     if (cusp->cooPerm_a) {
29267e8381f9SStefano Zampini       if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(matrix->values->size());
29277e8381f9SStefano Zampini       auto vbit = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
29287e8381f9SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
29297e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
29307e8381f9SStefano Zampini     } else {
29317e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
29327e8381f9SStefano Zampini                                                                 matrix->values->begin()));
29337e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
29347e8381f9SStefano Zampini                                                                 matrix->values->end()));
29357e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
29367e8381f9SStefano Zampini     }
29377e8381f9SStefano Zampini   } else {
29387e8381f9SStefano Zampini     if (cusp->cooPerm_a) { /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
29397e8381f9SStefano Zampini                               if we are inserting two different values into the same location */
29407e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
29417e8381f9SStefano Zampini                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->begin())));
29427e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
29437e8381f9SStefano Zampini                                                                 thrust::make_permutation_iterator(matrix->values->begin(),cusp->cooPerm_a->end())));
29447e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
29457e8381f9SStefano Zampini     } else {
29467e8381f9SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin()),
29477e8381f9SStefano Zampini                                                                 matrix->values->begin()));
29487e8381f9SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end()),
29497e8381f9SStefano Zampini                                                                 matrix->values->end()));
29507e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
29517e8381f9SStefano Zampini     }
29527e8381f9SStefano Zampini   }
29537e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
29547e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
29557e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
29567e8381f9SStefano Zampini   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
29577e8381f9SStefano Zampini   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
29587e8381f9SStefano Zampini   /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
29597e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
29607e8381f9SStefano Zampini   PetscFunctionReturn(0);
29617e8381f9SStefano Zampini }
29627e8381f9SStefano Zampini 
29637e8381f9SStefano Zampini #include <thrust/binary_search.h>
29647e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
29657e8381f9SStefano Zampini {
29667e8381f9SStefano Zampini   PetscErrorCode     ierr;
29677e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
29687e8381f9SStefano Zampini   CsrMatrix          *matrix;
29697e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
29707e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
29717e8381f9SStefano Zampini   cudaError_t        cerr;
29727e8381f9SStefano Zampini 
29737e8381f9SStefano Zampini   PetscFunctionBegin;
29747e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
29757e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
29767e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
29777e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
29787e8381f9SStefano Zampini   if (n != cooPerm_n) {
29797e8381f9SStefano Zampini     delete cusp->cooPerm;
29807e8381f9SStefano Zampini     delete cusp->cooPerm_v;
29817e8381f9SStefano Zampini     delete cusp->cooPerm_w;
29827e8381f9SStefano Zampini     delete cusp->cooPerm_a;
29837e8381f9SStefano Zampini     cusp->cooPerm = NULL;
29847e8381f9SStefano Zampini     cusp->cooPerm_v = NULL;
29857e8381f9SStefano Zampini     cusp->cooPerm_w = NULL;
29867e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
29877e8381f9SStefano Zampini   }
29887e8381f9SStefano Zampini   if (n) {
29897e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
29907e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
29917e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
29927e8381f9SStefano Zampini 
29937e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
29947e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
29957e8381f9SStefano Zampini 
29967e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
29977e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
29987e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
29997e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
30007e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
30017e8381f9SStefano Zampini 
30027e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
30037e8381f9SStefano Zampini     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
30047e8381f9SStefano Zampini     *cusp->cooPerm_a = d_i;
30057e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
30067e8381f9SStefano Zampini 
30077e8381f9SStefano Zampini     auto nekey = thrust::unique(fkey, ekey, IJEqual());
30087e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
30097e8381f9SStefano Zampini       delete cusp->cooPerm_a;
30107e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
30117e8381f9SStefano Zampini     } else { /* I couldn't come up with a more elegant algorithm */
30127e8381f9SStefano Zampini       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
30137e8381f9SStefano Zampini       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
30147e8381f9SStefano Zampini       (*cusp->cooPerm_a)[0] = 0;
30157e8381f9SStefano Zampini       w[0] = 0;
30167e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
30177e8381f9SStefano Zampini       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
30187e8381f9SStefano Zampini     }
30197e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
30207e8381f9SStefano Zampini     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
30217e8381f9SStefano Zampini                         search_begin, search_begin + A->rmap->n,
30227e8381f9SStefano Zampini                         ii.begin());
30237e8381f9SStefano Zampini 
30247e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
30257e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
30267e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
30277e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
30287e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
30297e8381f9SStefano Zampini     a->i[0] = 0;
30307e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
30317e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
30327e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
30337e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
30347e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
30357e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
30367e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
30377e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
30387e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
30397e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
30407e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
30417e8381f9SStefano Zampini     }
30427e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
30437e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
30447e8381f9SStefano Zampini   } else {
30457e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
30467e8381f9SStefano Zampini   }
30477e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
30487e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
30497e8381f9SStefano Zampini 
30507e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
30517e8381f9SStefano Zampini      The code is so convoluted now that I prefer to copy garbage to the GPU */
30527e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
30537e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
30547e8381f9SStefano Zampini   A->nonzerostate++;
30557e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
30567e8381f9SStefano Zampini   {
30577e8381f9SStefano Zampini     matrix = (CsrMatrix*)cusp->mat->mat;
30587e8381f9SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
30597e8381f9SStefano Zampini     thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
30607e8381f9SStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
30617e8381f9SStefano Zampini   }
30627e8381f9SStefano Zampini 
30637e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
30647e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
30657e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
30667e8381f9SStefano Zampini   PetscFunctionReturn(0);
30677e8381f9SStefano Zampini }
3068