xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 2cbc15d9fc4a2cb47580507275a5895d7ad1e7ad)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
/* Option strings for MatCUSPARSEStorageFormat: the value names come first, followed by the enum
   type name, the common prefix of the enum values, and a terminating 0. */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
  } cusparseCsr2CscAlg_t;
  */
  /* position in each table == integer value of the corresponding cuSPARSE enum member */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT", /* 0 */
    "COOMV_ALG",      /* 1 */
    "CSRMV_ALG1",     /* 2 */
    "CSRMV_ALG2",     /* 3 */
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    0
  };
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT", /* 0 */
    "COO_ALG1",    /* 1 */
    "COO_ALG2",    /* 2 */
    "COO_ALG3",    /* 3 */
    "CSR_ALG1",    /* 4 */
    "COO_ALG4",    /* 5 */
    "CSR_ALG2",    /* 6 */
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    0
  };
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID", /* 0: cusparse does not have enum 0! We created one */
    "ALG1",    /* 1 */
    "ALG2",    /* 2 */
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    0
  };
#endif
539ae82921SPaul Mullowney 
54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
57087f3262SPaul Mullowney 
586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
61087f3262SPaul Mullowney 
626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
72e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
749ae82921SPaul Mullowney 
757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
817f756511SDominic Meiser 
827e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
837e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
847e8381f9SStefano Zampini 
/*
  MatCUSPARSESetStream - Stores a CUDA stream in the matrix's CUSPARSE data and attaches it to
  the cuSPARSE handle kept there.

  Input Parameters:
+ A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
- stream - the CUDA stream passed on to cusparseSetStream()

  Errors with PETSC_ERR_COR ("Missing spptr") when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  /* remember the stream first, then bind the handle to the stored value */
  cusp->stream = stream;
  cstat = cusparseSetStream(cusp->handle,cusp->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
96b06137fdSPaul Mullowney 
/*
  MatCUSPARSESetHandle - Makes the matrix use the given cuSPARSE handle.

  Input Parameters:
+ A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
- handle - the cuSPARSE handle to store

  If the matrix currently holds a different, non-null handle, that handle is destroyed with
  cusparseDestroy() before the new one is stored. The handle in use afterwards is always put
  into CUSPARSE_POINTER_MODE_DEVICE.

  Errors with PETSC_ERR_COR ("Missing spptr") when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->handle != handle) {
    /* release the handle held so far (if any) before adopting the caller's one */
    if (cusp->handle) {cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);}
    cusp->handle = handle;
  }
  cstat = cusparseSetPointerMode(cusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
113b06137fdSPaul Mullowney 
/*
  MatCUSPARSEClearHandle - Zeroes the cuSPARSE handle stored in the matrix without destroying it.

  Input Parameter:
. A - the matrix

  Does nothing when A is not of type MATSEQAIJCUSPARSE or when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  PetscErrorCode     ierr;
  PetscBool          iscusparse;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  /* short-circuit evaluation keeps cusp from being dereferenced unless type and pointer check out */
  if (iscusparse && cusp && cusp->handle) {
    cusp->handle = 0;
  }
  PetscFunctionReturn(0);
}
126b06137fdSPaul Mullowney 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports the solver type of a CUSPARSE factor matrix.

  Input Parameter:
. A - the matrix (not inspected)

  Output Parameter:
. type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1339ae82921SPaul Mullowney 
134c708e6cdSJed Brown /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU, of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
141c708e6cdSJed Brown 
1429ae82921SPaul Mullowney   Level: beginner
143c708e6cdSJed Brown 
1443ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
145c708e6cdSJed Brown M*/
1469ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the (still empty) factor matrix used by the
  CUSPARSE solver for a given factorization type.

  Input Parameters:
+ A     - the matrix to be factored; only its local row count A->rmap->n and communicator are used here
- ftype - the requested factorization type

  Output Parameter:
. B - the new matrix of type MATSEQAIJCUSPARSE, square with A->rmap->n rows and columns

  LU, ILU and ILUDT requests get the ILU/LU symbolic routines (and the block sizes of A);
  Cholesky and ICC requests get the ICC/Cholesky symbolic routines. Any other type raises
  PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt nloc = A->rmap->n;
  Mat            F;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B;
  ierr = MatSetSizes(F,nloc,nloc,nloc,nloc);CHKERRQ(ierr);
  /* these two fields are filled in before the type is set, in this order */
  F->factortype  = ftype;
  F->useordering = PETSC_TRUE;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
    F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1729ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Stores the requested GPU storage format in the
  Mat_SeqAIJCUSPARSE data of A.

  Input Parameters:
+ A      - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE
. op     - must be MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL (both are handled identically here)
- format - the storage format to record

  Any other op raises PETSC_ERR_SUP and leaves the stored format untouched.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op != MAT_CUSPARSE_MULT && op != MAT_CUSPARSE_ALL) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  cusp->format = format;
  PetscFunctionReturn(0);
}
1909ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Notes:
   The request is forwarded to the "MatCUSPARSESetFormat_C" method composed with the matrix.

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation composed under "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
217e057df02SPaul Mullowney 
/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Notes:
   Matrices of any other type are silently ignored; factored matrices are rejected with
   PETSC_ERR_ARG_WRONGSTATE. Passing PETSC_FALSE also destroys a previously stored transpose.

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  Mat_SeqAIJCUSPARSE *cusp;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse) PetscFunctionReturn(0);

  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp           = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusp->transgen = transgen;
  if (!transgen) {
    /* drop any existing transpose so that a stale copy is not reused if the flag is turned on again later */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
250e6e9a74fSStefano Zampini 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the options-database entries of a SeqAIJCUSPARSE matrix.

  Input Parameters:
+ PetscOptionsObject - the options context used by the PetscOptionsXXX() calls
- A                  - the matrix

  Options are only queried for unfactored matrices (A->factortype == MAT_FACTOR_NONE):
    -mat_cusparse_transgen, -mat_cusparse_mult_storage_format, -mat_cusparse_storage_format
  and, with CUDA 11.0 or later,
    -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg, -mat_cusparse_csr2csc_alg
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool gentrans = cusp->transgen;

    /* explicit transpose generation */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",gentrans,&gentrans,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gentrans);CHKERRQ(ierr);
    }

    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* SpMV algorithm; the default is the CSR one since only csr is supported */
    cusp->spmvAlg = CUSPARSE_CSRMV_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusp->spmvAlg,(PetscEnum*)&cusp->spmvAlg,&set);CHKERRQ(ierr);
    /* PetscOptionsEnum() maps a string to its position in MatCUSPARSESpMVAlgorithms[]; when the option
       was given, verify that this position still agrees with the cuSPARSE enum value */
    if (set && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    /* SpMM algorithm; the default only supports a column-major dense matrix B */
    cusp->spmmAlg = CUSPARSE_SPMM_CSR_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusp->spmmAlg,(PetscEnum*)&cusp->spmmAlg,&set);CHKERRQ(ierr);
    if (set && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* CSR to CSC conversion algorithm */
    cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusp->csr2cscAlg,(PetscEnum*)&cusp->csr2cscAlg,&set);CHKERRQ(ierr);
    if (set && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2949ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data reached through B->spptr, runs MatILUFactorSymbolic_SeqAIJ()
  with the same arguments, and then installs MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric
  LU routine of B.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* clear whatever the previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3069ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data reached through B->spptr, runs MatLUFactorSymbolic_SeqAIJ()
  with the same arguments, and then installs MatLUFactorNumeric_SeqAIJCUSPARSE as the numeric
  LU routine of B.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* clear whatever the previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3189ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data reached through B->spptr, runs MatICCFactorSymbolic_SeqAIJ()
  with the same arguments, and then installs MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the
  numeric Cholesky routine of B.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* clear whatever the previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
330087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for the CUSPARSE factor matrix.

  Resets the triangular-factor data reached through B->spptr, runs
  MatCholeskyFactorSymbolic_SeqAIJ() with the same arguments, and then installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE as the numeric Cholesky routine of B.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* clear whatever the previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
342087f3262SPaul Mullowney 
343087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3449ae82921SPaul Mullowney {
3459ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3469ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3479ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
348aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3499ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3509ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3519ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3529ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3539ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
354b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
35557d48284SJunchao Zhang   cudaError_t                       cerr;
3569ae82921SPaul Mullowney 
3579ae82921SPaul Mullowney   PetscFunctionBegin;
358cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
359c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3609ae82921SPaul Mullowney     try {
3619ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3629ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
363da79fbbcSStefano Zampini       if (!loTriFactor) {
364*2cbc15d9SMark 	PetscScalar                       *AALo;
365*2cbc15d9SMark 
366*2cbc15d9SMark 	cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3679ae82921SPaul Mullowney 
3689ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
36957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3719ae82921SPaul Mullowney 
3729ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3739ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3749ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3759ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3769ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3779ae82921SPaul Mullowney         v        = aa;
3789ae82921SPaul Mullowney         vi       = aj;
3799ae82921SPaul Mullowney         offset   = 1;
3809ae82921SPaul Mullowney         rowOffset= 1;
3819ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3829ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
383e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3849ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3859ae82921SPaul Mullowney           rowOffset += nz+1;
3869ae82921SPaul Mullowney 
387580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
388580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3899ae82921SPaul Mullowney 
3909ae82921SPaul Mullowney           offset      += nz;
3919ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3929ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3939ae82921SPaul Mullowney           offset      += 1;
3949ae82921SPaul Mullowney 
3959ae82921SPaul Mullowney           v  += nz;
3969ae82921SPaul Mullowney           vi += nz;
3979ae82921SPaul Mullowney         }
3982205254eSKarl Rupp 
399aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
400da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
401da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
402aa372e3fSPaul Mullowney         /* Create the matrix description */
40357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
40457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4051b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
406afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
407afb2bd1cSJunchao Zhang        #else
40857d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
409afb2bd1cSJunchao Zhang        #endif
41057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
412aa372e3fSPaul Mullowney 
413aa372e3fSPaul Mullowney         /* set the operation */
414aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
415aa372e3fSPaul Mullowney 
416aa372e3fSPaul Mullowney         /* set the matrix */
417aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
418aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
419aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
420aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
421aa372e3fSPaul Mullowney 
422aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
423aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
424aa372e3fSPaul Mullowney 
425aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
427aa372e3fSPaul Mullowney 
428aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
429aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
430aa372e3fSPaul Mullowney 
431afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
432da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
433afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4341b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
435afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
436afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
437afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
438afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
439afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
440afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
441afb2bd1cSJunchao Zhang       #endif
442afb2bd1cSJunchao Zhang 
443aa372e3fSPaul Mullowney         /* perform the solve analysis */
444aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
445aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
446aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
447afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4481b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
449afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
450afb2bd1cSJunchao Zhang                                #endif
451afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
452da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
453da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
454aa372e3fSPaul Mullowney 
455da79fbbcSStefano Zampini         /* assign the pointer */
456aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
457*2cbc15d9SMark 	loTriFactor->AA_h = AALo;
45857d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
45957d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4604863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
461da79fbbcSStefano Zampini       } else { /* update values only */
462*2cbc15d9SMark 	if (!loTriFactor->AA_h) {
463*2cbc15d9SMark 	  cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
464*2cbc15d9SMark 	}
465da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
466*2cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
467da79fbbcSStefano Zampini         v        = aa;
468da79fbbcSStefano Zampini         vi       = aj;
469da79fbbcSStefano Zampini         offset   = 1;
470da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
471da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
472*2cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
473da79fbbcSStefano Zampini           offset      += nz;
474*2cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
475da79fbbcSStefano Zampini           offset      += 1;
476da79fbbcSStefano Zampini           v  += nz;
477da79fbbcSStefano Zampini         }
478*2cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
479da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
480da79fbbcSStefano Zampini       }
4819ae82921SPaul Mullowney     } catch(char *ex) {
4829ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4839ae82921SPaul Mullowney     }
4849ae82921SPaul Mullowney   }
4859ae82921SPaul Mullowney   PetscFunctionReturn(0);
4869ae82921SPaul Mullowney }
4879ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds, or refreshes the values of, the upper
  triangular factor stored in the Mat_SeqAIJCUSPARSETriFactors hanging off A->spptr.

  Input Parameter:
. A - the factored matrix; its Mat_SeqAIJ data (a->a, a->j, a->diag) is read on the host

  Notes:
  Nothing is done when A has zero local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  First call (no upTriFactorPtr yet): host staging arrays are obtained with cudaMallocHost(), the CSR
  structure and values are assembled on the host, copied into thrust device arrays, the cuSPARSE
  descriptor/analysis objects are created and the solve analysis is run. The value staging buffer is
  retained in upTriFactor->AA_h; the two index staging buffers are released.

  Later calls: only the values are re-assembled into upTriFactor->AA_h and re-assigned to the device array.

  In both cases the diagonal entry of each row is stored as the reciprocal of v[nz], placed first in the row.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* value staging buffer; ownership is handed to upTriFactor->AA_h below so it can be reused on updates */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix; rows are visited last-to-first and written back-to-front */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored as the reciprocal of v[nz]) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          /* then the nz off-diagonal entries of the row */
          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: copy the host CSR arrays into freshly allocated thrust arrays */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query the work-buffer size and allocate that buffer on the device */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
                                );CHKERRCUSPARSE(stat);
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the value staging buffer for later value-only updates; the index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements; the cast mirrors the build branch above and
             avoids a mixed double/MatScalar division */
          upTriFactor->AA_h[offset] = (MatScalar)1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
6319ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Builds the lower and upper ILU triangular factors of A
  through the two Build*TriMatrix helpers, makes sure the work vector exists, and stores the row and
  column permutation indices in the tri-factor structure when the corresponding IS is not the identity
  and the indices have not been stored yet.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Errors with PETSC_ERR_COR when A->spptr is NULL. A->offloadmask is set to PETSC_OFFLOAD_BOTH
  once both factors have been built.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij        = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowIS       = aij->row;
  IS                           colIS       = aij->icol;
  PetscInt                     nrows       = A->rmap->n;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!triFactors) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  }

  /* build (or refresh) both triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* the work vector is created once and then reused */
  if (!triFactors->workVector) {
    triFactors->workVector = new THRUSTARRAY(nrows);
  }
  triFactors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: stored only for a non-identity ordering that has not been stored before */
  ierr = ISIdentity(rowIS,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || triFactors->rpermIndices)) {
    const PetscInt *rowIdx;

    ierr = ISGetIndices(rowIS,&rowIdx);CHKERRQ(ierr);
    triFactors->rpermIndices = new THRUSTINTARRAY(nrows);
    triFactors->rpermIndices->assign(rowIdx, rowIdx+nrows);
    ierr = ISRestoreIndices(rowIS,&rowIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment, taken from the icol index set */
  ierr = ISIdentity(colIS,&isIdentity);CHKERRQ(ierr);
  if (!(isIdentity || triFactors->cpermIndices)) {
    const PetscInt *colIdx;

    ierr = ISGetIndices(colIS,&colIdx);CHKERRQ(ierr);
    triFactors->cpermIndices = new THRUSTINTARRAY(nrows);
    triFactors->cpermIndices->assign(colIdx, colIdx+nrows);
    ierr = ISRestoreIndices(colIS,&colIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6759ae82921SPaul Mullowney 
676087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
677087f3262SPaul Mullowney {
678087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
679087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
680aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
681aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
682087f3262SPaul Mullowney   cusparseStatus_t                  stat;
683087f3262SPaul Mullowney   PetscErrorCode                    ierr;
68457d48284SJunchao Zhang   cudaError_t                       cerr;
685087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
686087f3262SPaul Mullowney   PetscScalar                       *AAUp;
687087f3262SPaul Mullowney   PetscScalar                       *AALo;
688087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
689087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
690087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
691087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
692087f3262SPaul Mullowney 
693087f3262SPaul Mullowney   PetscFunctionBegin;
694cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
695c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
696087f3262SPaul Mullowney     try {
697da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
698da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
699da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
700087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
70157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
70257d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
703087f3262SPaul Mullowney 
704087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
705087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
706087f3262SPaul Mullowney         AiUp[n]=nzUpper;
707087f3262SPaul Mullowney         offset = 0;
708087f3262SPaul Mullowney         for (i=0; i<n; i++) {
709087f3262SPaul Mullowney           /* set the pointers */
710087f3262SPaul Mullowney           v  = aa + ai[i];
711087f3262SPaul Mullowney           vj = aj + ai[i];
712087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
713087f3262SPaul Mullowney 
714087f3262SPaul Mullowney           /* first, set the diagonal elements */
715087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
71609f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
717087f3262SPaul Mullowney           AiUp[i]      = offset;
71809f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
719087f3262SPaul Mullowney 
720087f3262SPaul Mullowney           offset+=1;
721087f3262SPaul Mullowney           if (nz>0) {
722f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
723580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
724087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
725087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
726087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
727087f3262SPaul Mullowney             }
728087f3262SPaul Mullowney             offset+=nz;
729087f3262SPaul Mullowney           }
730087f3262SPaul Mullowney         }
731087f3262SPaul Mullowney 
732aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
733da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
734da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
735087f3262SPaul Mullowney 
736aa372e3fSPaul Mullowney         /* Create the matrix description */
73757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
73857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7391b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
740afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
741afb2bd1cSJunchao Zhang        #else
74257d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
743afb2bd1cSJunchao Zhang        #endif
74457d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
74557d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
746087f3262SPaul Mullowney 
747aa372e3fSPaul Mullowney         /* set the matrix */
748aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
749aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
750aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
751aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
752aa372e3fSPaul Mullowney 
753aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
754aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
755aa372e3fSPaul Mullowney 
756aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
757aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
758aa372e3fSPaul Mullowney 
759aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
760aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
761aa372e3fSPaul Mullowney 
762afb2bd1cSJunchao Zhang         /* set the operation */
763afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
764afb2bd1cSJunchao Zhang 
765afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
766da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
767afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7681b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
769afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
770afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
771afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
772afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
773afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
774afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
775afb2bd1cSJunchao Zhang       #endif
776afb2bd1cSJunchao Zhang 
777aa372e3fSPaul Mullowney         /* perform the solve analysis */
778aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
779aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
780aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
781afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7821b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
783afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
784afb2bd1cSJunchao Zhang                                 #endif
785afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
786da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
787da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
788aa372e3fSPaul Mullowney 
789da79fbbcSStefano Zampini         /* assign the pointer */
790aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
791aa372e3fSPaul Mullowney 
792aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
793da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
794da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
795aa372e3fSPaul Mullowney 
796aa372e3fSPaul Mullowney         /* Create the matrix description */
79757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
79857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7991b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
800afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
801afb2bd1cSJunchao Zhang        #else
80257d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
803afb2bd1cSJunchao Zhang        #endif
80457d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
80557d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
806aa372e3fSPaul Mullowney 
807aa372e3fSPaul Mullowney         /* set the operation */
808aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
809aa372e3fSPaul Mullowney 
810aa372e3fSPaul Mullowney         /* set the matrix */
811aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
812aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
813aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
814aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
815aa372e3fSPaul Mullowney 
816aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
817aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
818aa372e3fSPaul Mullowney 
819aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
820aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
821aa372e3fSPaul Mullowney 
822aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
823aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
824aa372e3fSPaul Mullowney 
825afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
826da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
827afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8281b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
829afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
830afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
831afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
832afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
833afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
834afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
835afb2bd1cSJunchao Zhang       #endif
836afb2bd1cSJunchao Zhang 
837aa372e3fSPaul Mullowney         /* perform the solve analysis */
838aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
839aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
840aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
841afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8421b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
843afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
844afb2bd1cSJunchao Zhang                                 #endif
845afb2bd1cSJunchao Zhang                                 );CHKERRCUSPARSE(stat);
846da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
847da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
848aa372e3fSPaul Mullowney 
849da79fbbcSStefano Zampini         /* assign the pointer */
850aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
851087f3262SPaul Mullowney 
852da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
85357d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
85457d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
855da79fbbcSStefano Zampini       } else {
856da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
857da79fbbcSStefano Zampini         offset = 0;
858da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
859da79fbbcSStefano Zampini           /* set the pointers */
860da79fbbcSStefano Zampini           v  = aa + ai[i];
861da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
862da79fbbcSStefano Zampini 
863da79fbbcSStefano Zampini           /* first, set the diagonal elements */
864da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
865da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
866da79fbbcSStefano Zampini 
867da79fbbcSStefano Zampini           offset+=1;
868da79fbbcSStefano Zampini           if (nz>0) {
869da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
870da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
871da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
872da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
873da79fbbcSStefano Zampini             }
874da79fbbcSStefano Zampini             offset+=nz;
875da79fbbcSStefano Zampini           }
876da79fbbcSStefano Zampini         }
877da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
878da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
879da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
880da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
881da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
882da79fbbcSStefano Zampini       }
88357d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
88457d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
885087f3262SPaul Mullowney     } catch(char *ex) {
886087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
887087f3262SPaul Mullowney     }
888087f3262SPaul Mullowney   }
889087f3262SPaul Mullowney   PetscFunctionReturn(0);
890087f3262SPaul Mullowney }
891087f3262SPaul Mullowney 
892087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
8939ae82921SPaul Mullowney {
8949ae82921SPaul Mullowney   PetscErrorCode               ierr;
895087f3262SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
896087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
897087f3262SPaul Mullowney   IS                           ip = a->row;
898087f3262SPaul Mullowney   PetscBool                    perm_identity;
899087f3262SPaul Mullowney   PetscInt                     n = A->rmap->n;
900087f3262SPaul Mullowney 
901087f3262SPaul Mullowney   PetscFunctionBegin;
902da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
903087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
904da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
905aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=(a->nz-n)*2 + n;
906aa372e3fSPaul Mullowney 
907da79fbbcSStefano Zampini   A->offloadmask = PETSC_OFFLOAD_BOTH;
908da79fbbcSStefano Zampini 
909087f3262SPaul Mullowney   /* lower triangular indices */
910087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
911087f3262SPaul Mullowney   if (!perm_identity) {
9124e4bbfaaSStefano Zampini     IS             iip;
913da79fbbcSStefano Zampini     const PetscInt *irip,*rip;
9144e4bbfaaSStefano Zampini 
9154e4bbfaaSStefano Zampini     ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
9164e4bbfaaSStefano Zampini     ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
917da79fbbcSStefano Zampini     ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
918aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
919aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(rip, rip+n);
920aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
9214e4bbfaaSStefano Zampini     cusparseTriFactors->cpermIndices->assign(irip, irip+n);
9224e4bbfaaSStefano Zampini     ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
9234e4bbfaaSStefano Zampini     ierr = ISDestroy(&iip);CHKERRQ(ierr);
924087f3262SPaul Mullowney     ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
925da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
926da79fbbcSStefano Zampini   }
927087f3262SPaul Mullowney   PetscFunctionReturn(0);
928087f3262SPaul Mullowney }
929087f3262SPaul Mullowney 
9306fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
9319ae82921SPaul Mullowney {
9329ae82921SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
9339ae82921SPaul Mullowney   IS             isrow = b->row,iscol = b->col;
9349ae82921SPaul Mullowney   PetscBool      row_identity,col_identity;
935b175d8bbSPaul Mullowney   PetscErrorCode ierr;
9369ae82921SPaul Mullowney 
9379ae82921SPaul Mullowney   PetscFunctionBegin;
9389ae82921SPaul Mullowney   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
939ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
940e057df02SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
9419ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
9429ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
943bda325fcSPaul Mullowney   if (row_identity && col_identity) {
944bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
945bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9464e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9474e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
948bda325fcSPaul Mullowney   } else {
949bda325fcSPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
950bda325fcSPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9514e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9524e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
953bda325fcSPaul Mullowney   }
9548dc1d2a3SPaul Mullowney 
955e057df02SPaul Mullowney   /* get the triangular factors */
956087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
9579ae82921SPaul Mullowney   PetscFunctionReturn(0);
9589ae82921SPaul Mullowney }
9599ae82921SPaul Mullowney 
960087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
961087f3262SPaul Mullowney {
962087f3262SPaul Mullowney   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
963087f3262SPaul Mullowney   IS             ip = b->row;
964087f3262SPaul Mullowney   PetscBool      perm_identity;
965b175d8bbSPaul Mullowney   PetscErrorCode ierr;
966087f3262SPaul Mullowney 
967087f3262SPaul Mullowney   PetscFunctionBegin;
968087f3262SPaul Mullowney   ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
969ccdfe979SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
970087f3262SPaul Mullowney   /* determine which version of MatSolve needs to be used. */
971087f3262SPaul Mullowney   ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
972087f3262SPaul Mullowney   if (perm_identity) {
973087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
974087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
9754e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9764e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
977087f3262SPaul Mullowney   } else {
978087f3262SPaul Mullowney     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
979087f3262SPaul Mullowney     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
9804e4bbfaaSStefano Zampini     B->ops->matsolve = NULL;
9814e4bbfaaSStefano Zampini     B->ops->matsolvetranspose = NULL;
982087f3262SPaul Mullowney   }
983087f3262SPaul Mullowney 
984087f3262SPaul Mullowney   /* get the triangular factors */
985087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
986087f3262SPaul Mullowney   PetscFunctionReturn(0);
987087f3262SPaul Mullowney }
9889ae82921SPaul Mullowney 
989b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
990bda325fcSPaul Mullowney {
991bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
992aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
993aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
994da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
995da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
996bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
997aa372e3fSPaul Mullowney   cusparseIndexBase_t               indexBase;
998aa372e3fSPaul Mullowney   cusparseMatrixType_t              matrixType;
999aa372e3fSPaul Mullowney   cusparseFillMode_t                fillMode;
1000aa372e3fSPaul Mullowney   cusparseDiagType_t                diagType;
10011b0a6780SStefano Zampini   cudaError_t                       cerr;
1002da79fbbcSStefano Zampini   PetscErrorCode                    ierr;
1003b175d8bbSPaul Mullowney 
1004bda325fcSPaul Mullowney   PetscFunctionBegin;
1005aa372e3fSPaul Mullowney   /* allocate space for the transpose of the lower triangular factor */
1006da79fbbcSStefano Zampini   ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr);
1007da79fbbcSStefano Zampini   loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1008aa372e3fSPaul Mullowney 
1009aa372e3fSPaul Mullowney   /* set the matrix descriptors of the lower triangular factor */
1010aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(loTriFactor->descr);
1011aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(loTriFactor->descr);
1012aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1013aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1014aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(loTriFactor->descr);
1015aa372e3fSPaul Mullowney 
1016aa372e3fSPaul Mullowney   /* Create the matrix description */
101757d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat);
101857d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
101957d48284SJunchao Zhang   stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
102057d48284SJunchao Zhang   stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
102157d48284SJunchao Zhang   stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1022aa372e3fSPaul Mullowney 
1023aa372e3fSPaul Mullowney   /* set the operation */
1024aa372e3fSPaul Mullowney   loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1025aa372e3fSPaul Mullowney 
1026aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the lower triangular factor*/
1027aa372e3fSPaul Mullowney   loTriFactorT->csrMat = new CsrMatrix;
1028afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_rows       = loTriFactor->csrMat->num_cols;
1029afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->num_cols       = loTriFactor->csrMat->num_rows;
1030aa372e3fSPaul Mullowney   loTriFactorT->csrMat->num_entries    = loTriFactor->csrMat->num_entries;
1031afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1);
1032afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries);
1033afb2bd1cSJunchao Zhang   loTriFactorT->csrMat->values         = new THRUSTARRAY(loTriFactorT->csrMat->num_entries);
1034aa372e3fSPaul Mullowney 
1035aa372e3fSPaul Mullowney   /* compute the transpose of the lower triangular factor, i.e. the CSC */
1036afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1037afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1038afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1039afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(),
1040afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->row_offsets->data().get(),
1041afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(),
1042afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->values->data().get(),
1043afb2bd1cSJunchao Zhang                                        loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1044afb2bd1cSJunchao Zhang                                        CUSPARSE_ACTION_NUMERIC,indexBase,
1045afb2bd1cSJunchao Zhang                                        CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
10461b0a6780SStefano Zampini   cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1047afb2bd1cSJunchao Zhang #endif
1048afb2bd1cSJunchao Zhang 
1049da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1050aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows,
1051aa372e3fSPaul Mullowney                           loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries,
1052aa372e3fSPaul Mullowney                           loTriFactor->csrMat->values->data().get(),
1053aa372e3fSPaul Mullowney                           loTriFactor->csrMat->row_offsets->data().get(),
1054aa372e3fSPaul Mullowney                           loTriFactor->csrMat->column_indices->data().get(),
1055aa372e3fSPaul Mullowney                           loTriFactorT->csrMat->values->data().get(),
1056afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1057afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1058afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1059afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer
1060afb2bd1cSJunchao Zhang                         #else
1061afb2bd1cSJunchao Zhang                           loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1062afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1063afb2bd1cSJunchao Zhang                         #endif
1064afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1065da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1066da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1067aa372e3fSPaul Mullowney 
1068afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1069da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1070afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
10711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1072afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp,
1073afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1074afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1075afb2bd1cSJunchao Zhang                                 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo,
1076afb2bd1cSJunchao Zhang                                 &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1077afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1078afb2bd1cSJunchao Zhang #endif
1079afb2bd1cSJunchao Zhang 
1080afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1081aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp,
1082afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr,
1083afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(),
1084afb2bd1cSJunchao Zhang                            loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo
10851b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1086afb2bd1cSJunchao Zhang                            ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1087afb2bd1cSJunchao Zhang                           #endif
1088afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
1089da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1090da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1091aa372e3fSPaul Mullowney 
1092da79fbbcSStefano Zampini   /* assign the pointer */
1093aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;
1094aa372e3fSPaul Mullowney 
1095aa372e3fSPaul Mullowney   /*********************************************/
1096aa372e3fSPaul Mullowney   /* Now the Transpose of the Upper Tri Factor */
1097aa372e3fSPaul Mullowney   /*********************************************/
1098aa372e3fSPaul Mullowney 
1099aa372e3fSPaul Mullowney   /* allocate space for the transpose of the upper triangular factor */
1100da79fbbcSStefano Zampini   ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr);
1101da79fbbcSStefano Zampini   upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
1102aa372e3fSPaul Mullowney 
1103aa372e3fSPaul Mullowney   /* set the matrix descriptors of the upper triangular factor */
1104aa372e3fSPaul Mullowney   matrixType = cusparseGetMatType(upTriFactor->descr);
1105aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(upTriFactor->descr);
1106aa372e3fSPaul Mullowney   fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
1107aa372e3fSPaul Mullowney     CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
1108aa372e3fSPaul Mullowney   diagType = cusparseGetMatDiagType(upTriFactor->descr);
1109aa372e3fSPaul Mullowney 
1110aa372e3fSPaul Mullowney   /* Create the matrix description */
111157d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat);
111257d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
111357d48284SJunchao Zhang   stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
111457d48284SJunchao Zhang   stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
111557d48284SJunchao Zhang   stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat);
1116aa372e3fSPaul Mullowney 
1117aa372e3fSPaul Mullowney   /* set the operation */
1118aa372e3fSPaul Mullowney   upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
1119aa372e3fSPaul Mullowney 
1120aa372e3fSPaul Mullowney   /* allocate GPU space for the CSC of the upper triangular factor*/
1121aa372e3fSPaul Mullowney   upTriFactorT->csrMat = new CsrMatrix;
1122afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_rows       = upTriFactor->csrMat->num_cols;
1123afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->num_cols       = upTriFactor->csrMat->num_rows;
1124aa372e3fSPaul Mullowney   upTriFactorT->csrMat->num_entries    = upTriFactor->csrMat->num_entries;
1125afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1);
1126afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries);
1127afb2bd1cSJunchao Zhang   upTriFactorT->csrMat->values         = new THRUSTARRAY(upTriFactorT->csrMat->num_entries);
1128aa372e3fSPaul Mullowney 
1129aa372e3fSPaul Mullowney   /* compute the transpose of the upper triangular factor, i.e. the CSC */
1130afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1131afb2bd1cSJunchao Zhang   stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows,
1132afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1133afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->values->data().get(),
1134afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->row_offsets->data().get(),
1135afb2bd1cSJunchao Zhang                                 upTriFactor->csrMat->column_indices->data().get(),
1136afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->values->data().get(),
1137afb2bd1cSJunchao Zhang                                 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1138afb2bd1cSJunchao Zhang                                 CUSPARSE_ACTION_NUMERIC,indexBase,
1139afb2bd1cSJunchao Zhang                                 CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1140afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
1141afb2bd1cSJunchao Zhang #endif
1142afb2bd1cSJunchao Zhang 
1143da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1144aa372e3fSPaul Mullowney   stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows,
1145aa372e3fSPaul Mullowney                           upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries,
1146aa372e3fSPaul Mullowney                           upTriFactor->csrMat->values->data().get(),
1147aa372e3fSPaul Mullowney                           upTriFactor->csrMat->row_offsets->data().get(),
1148aa372e3fSPaul Mullowney                           upTriFactor->csrMat->column_indices->data().get(),
1149aa372e3fSPaul Mullowney                           upTriFactorT->csrMat->values->data().get(),
1150afb2bd1cSJunchao Zhang                         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1151afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
1152afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase,
1153afb2bd1cSJunchao Zhang                           CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer
1154afb2bd1cSJunchao Zhang                         #else
1155afb2bd1cSJunchao Zhang                           upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1156afb2bd1cSJunchao Zhang                           CUSPARSE_ACTION_NUMERIC, indexBase
1157afb2bd1cSJunchao Zhang                         #endif
1158afb2bd1cSJunchao Zhang                         );CHKERRCUSPARSE(stat);
1159da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1160da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1161aa372e3fSPaul Mullowney 
1162afb2bd1cSJunchao Zhang   /* Create the solve analysis information */
1163da79fbbcSStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1164afb2bd1cSJunchao Zhang   stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat);
11651b0a6780SStefano Zampini   #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1166afb2bd1cSJunchao Zhang   stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp,
1167afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1168afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1169afb2bd1cSJunchao Zhang                                  upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo,
1170afb2bd1cSJunchao Zhang                                  &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
1171afb2bd1cSJunchao Zhang   cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr);
1172afb2bd1cSJunchao Zhang   #endif
1173afb2bd1cSJunchao Zhang 
1174afb2bd1cSJunchao Zhang   /* perform the solve analysis */
1175aa372e3fSPaul Mullowney   stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp,
1176afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr,
1177afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(),
1178afb2bd1cSJunchao Zhang                            upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo
11791b0a6780SStefano Zampini                           #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1180afb2bd1cSJunchao Zhang                            ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1181afb2bd1cSJunchao Zhang                           #endif
1182afb2bd1cSJunchao Zhang                           );CHKERRCUSPARSE(stat);
1183da79fbbcSStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1184da79fbbcSStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
1185aa372e3fSPaul Mullowney 
1186da79fbbcSStefano Zampini   /* assign the pointer */
1187aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
1188bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1189bda325fcSPaul Mullowney }
1190bda325fcSPaul Mullowney 
1191b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1192bda325fcSPaul Mullowney {
1193aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1194aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1195aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1196bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1197bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1198aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1199b06137fdSPaul Mullowney   cudaError_t                  err;
120085ba7357SStefano Zampini   PetscErrorCode               ierr;
1201b175d8bbSPaul Mullowney 
1202bda325fcSPaul Mullowney   PetscFunctionBegin;
120385ba7357SStefano Zampini   if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0);
120485ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
120585ba7357SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
120685ba7357SStefano Zampini   /* create cusparse matrix */
1207aa372e3fSPaul Mullowney   matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
120857d48284SJunchao Zhang   stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1209aa372e3fSPaul Mullowney   indexBase = cusparseGetMatIndexBase(matstruct->descr);
121057d48284SJunchao Zhang   stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
121157d48284SJunchao Zhang   stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1212aa372e3fSPaul Mullowney 
1213b06137fdSPaul Mullowney   /* set alpha and beta */
1214afb2bd1cSJunchao Zhang   err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12157656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12167656d835SStefano Zampini   err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1217afb2bd1cSJunchao Zhang   err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12187656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12197656d835SStefano Zampini   err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
122057d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1221b06137fdSPaul Mullowney 
1222aa372e3fSPaul Mullowney   if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1223aa372e3fSPaul Mullowney     CsrMatrix *matrix = (CsrMatrix*)matstruct->mat;
1224aa372e3fSPaul Mullowney     CsrMatrix *matrixT= new CsrMatrix;
1225554b8892SKarl Rupp     matrixT->num_rows = A->cmap->n;
1226554b8892SKarl Rupp     matrixT->num_cols = A->rmap->n;
1227aa372e3fSPaul Mullowney     matrixT->num_entries = a->nz;
1228a8bd5306SMark Adams     matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1229aa372e3fSPaul Mullowney     matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1230aa372e3fSPaul Mullowney     matrixT->values = new THRUSTARRAY(a->nz);
1231a3fdcf43SKarl Rupp 
123281902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
123381902715SJunchao Zhang     cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1234afb2bd1cSJunchao Zhang 
123581902715SJunchao Zhang     /* compute the transpose, i.e. the CSC */
1236afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1237afb2bd1cSJunchao Zhang     stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1238afb2bd1cSJunchao Zhang                                   A->cmap->n, matrix->num_entries,
1239afb2bd1cSJunchao Zhang                                   matrix->values->data().get(),
1240afb2bd1cSJunchao Zhang                                   cusparsestruct->rowoffsets_gpu->data().get(),
1241afb2bd1cSJunchao Zhang                                   matrix->column_indices->data().get(),
1242afb2bd1cSJunchao Zhang                                   matrixT->values->data().get(),
1243afb2bd1cSJunchao Zhang                                   matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1244afb2bd1cSJunchao Zhang                                   CUSPARSE_ACTION_NUMERIC,indexBase,
1245afb2bd1cSJunchao Zhang                                   cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat);
1246afb2bd1cSJunchao Zhang     err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err);
1247afb2bd1cSJunchao Zhang    #endif
1248afb2bd1cSJunchao Zhang 
1249a3fdcf43SKarl Rupp     stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1250a3fdcf43SKarl Rupp                             A->cmap->n, matrix->num_entries,
1251aa372e3fSPaul Mullowney                             matrix->values->data().get(),
125281902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1253aa372e3fSPaul Mullowney                             matrix->column_indices->data().get(),
1254aa372e3fSPaul Mullowney                             matrixT->values->data().get(),
1255afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1256afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1257afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1258afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1259afb2bd1cSJunchao Zhang                           #else
1260afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1261afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1262afb2bd1cSJunchao Zhang                           #endif
1263afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
1264aa372e3fSPaul Mullowney     matstructT->mat = matrixT;
1265afb2bd1cSJunchao Zhang 
1266afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1267afb2bd1cSJunchao Zhang     stat = cusparseCreateCsr(&matstructT->matDescr,
1268afb2bd1cSJunchao Zhang                              matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1269afb2bd1cSJunchao Zhang                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1270afb2bd1cSJunchao Zhang                              matrixT->values->data().get(),
1271afb2bd1cSJunchao Zhang                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1272afb2bd1cSJunchao Zhang                              indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1273afb2bd1cSJunchao Zhang    #endif
1274aa372e3fSPaul Mullowney   } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1275afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1276afb2bd1cSJunchao Zhang     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1277afb2bd1cSJunchao Zhang    #else
1278aa372e3fSPaul Mullowney     CsrMatrix *temp  = new CsrMatrix;
127951c6d536SStefano Zampini     CsrMatrix *tempT = new CsrMatrix;
128051c6d536SStefano Zampini     /* First convert HYB to CSR */
1281aa372e3fSPaul Mullowney     temp->num_rows = A->rmap->n;
1282aa372e3fSPaul Mullowney     temp->num_cols = A->cmap->n;
1283aa372e3fSPaul Mullowney     temp->num_entries = a->nz;
1284aa372e3fSPaul Mullowney     temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1285aa372e3fSPaul Mullowney     temp->column_indices = new THRUSTINTARRAY32(a->nz);
1286aa372e3fSPaul Mullowney     temp->values = new THRUSTARRAY(a->nz);
1287aa372e3fSPaul Mullowney 
1288aa372e3fSPaul Mullowney     stat = cusparse_hyb2csr(cusparsestruct->handle,
1289aa372e3fSPaul Mullowney                             matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1290aa372e3fSPaul Mullowney                             temp->values->data().get(),
1291aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
129257d48284SJunchao Zhang                             temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1293aa372e3fSPaul Mullowney 
1294aa372e3fSPaul Mullowney     /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1295aa372e3fSPaul Mullowney     tempT->num_rows = A->rmap->n;
1296aa372e3fSPaul Mullowney     tempT->num_cols = A->cmap->n;
1297aa372e3fSPaul Mullowney     tempT->num_entries = a->nz;
1298aa372e3fSPaul Mullowney     tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1299aa372e3fSPaul Mullowney     tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1300aa372e3fSPaul Mullowney     tempT->values = new THRUSTARRAY(a->nz);
1301aa372e3fSPaul Mullowney 
1302aa372e3fSPaul Mullowney     stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1303aa372e3fSPaul Mullowney                             temp->num_cols, temp->num_entries,
1304aa372e3fSPaul Mullowney                             temp->values->data().get(),
1305aa372e3fSPaul Mullowney                             temp->row_offsets->data().get(),
1306aa372e3fSPaul Mullowney                             temp->column_indices->data().get(),
1307aa372e3fSPaul Mullowney                             tempT->values->data().get(),
1308aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
1309aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
131057d48284SJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1311aa372e3fSPaul Mullowney 
1312aa372e3fSPaul Mullowney     /* Last, convert CSC to HYB */
1313aa372e3fSPaul Mullowney     cusparseHybMat_t hybMat;
131457d48284SJunchao Zhang     stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1315aa372e3fSPaul Mullowney     cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1316aa372e3fSPaul Mullowney       CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1317aa372e3fSPaul Mullowney     stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1318aa372e3fSPaul Mullowney                             matstructT->descr, tempT->values->data().get(),
1319aa372e3fSPaul Mullowney                             tempT->row_offsets->data().get(),
1320aa372e3fSPaul Mullowney                             tempT->column_indices->data().get(),
132157d48284SJunchao Zhang                             hybMat, 0, partition);CHKERRCUSPARSE(stat);
1322aa372e3fSPaul Mullowney 
1323aa372e3fSPaul Mullowney     /* assign the pointer */
1324aa372e3fSPaul Mullowney     matstructT->mat = hybMat;
1325aa372e3fSPaul Mullowney     /* delete temporaries */
1326aa372e3fSPaul Mullowney     if (tempT) {
1327aa372e3fSPaul Mullowney       if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1328aa372e3fSPaul Mullowney       if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1329aa372e3fSPaul Mullowney       if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1330aa372e3fSPaul Mullowney       delete (CsrMatrix*) tempT;
1331087f3262SPaul Mullowney     }
1332aa372e3fSPaul Mullowney     if (temp) {
1333aa372e3fSPaul Mullowney       if (temp->values) delete (THRUSTARRAY*) temp->values;
1334aa372e3fSPaul Mullowney       if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1335aa372e3fSPaul Mullowney       if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1336aa372e3fSPaul Mullowney       delete (CsrMatrix*) temp;
1337aa372e3fSPaul Mullowney     }
1338afb2bd1cSJunchao Zhang    #endif
1339aa372e3fSPaul Mullowney   }
134005035670SJunchao Zhang   err  = WaitForCUDA();CHKERRCUDA(err);
134185ba7357SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
134285ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1343213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1344213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1345aa372e3fSPaul Mullowney   /* assign the pointer */
1346aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1347bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1348bda325fcSPaul Mullowney }
1349bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve with the cuSPARSE triangular
  factors of A when row/column permutations are present.

  Input Parameters:
+ A  - factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side

  Output Parameter:
. xx - solution (fully overwritten)

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve()
  is called first and the factor pointers are re-read.
  The work vector stored in the factor structure is used as scratch space.
  Logs GPU time and 2*nnz - (number of columns of A) GPU flops.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  PetscInt                          len      = xx->map->n;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  status;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors on first use */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the solution (write-only access) and right-hand side (read-only access) */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: gather the right-hand side through the row permutation into the solution array */
  thrust::copy(thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsGPU+len, factors->rpermIndices->end()),
               solGPU);

  /* Step 2: solve with the transposed upper factor, solution array -> work vector */
  CsrMatrix *upperMat = upperT->csrMat;
  status = cusparse_solve(factors->handle, upperT->solveOp,
                          upperMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upperMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upperT->descr,
                          upperMat->values->data().get(),
                          upperMat->row_offsets->data().get(),
                          upperMat->column_indices->data().get(),
                          upperT->solveInfo,
                          sol, work->data().get()
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upperT->solvePolicy, upperT->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Step 3: solve with the transposed lower factor, work vector -> solution array */
  CsrMatrix *lowerMat = lowerT->csrMat;
  status = cusparse_solve(factors->handle, lowerT->solveOp,
                          lowerMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lowerMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lowerT->descr,
                          lowerMat->values->data().get(),
                          lowerMat->row_offsets->data().get(),
                          lowerMat->column_indices->data().get(),
                          lowerT->solveInfo,
                          work->data().get(), sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lowerT->solvePolicy, lowerT->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Step 4: apply the column permutation. A gather cannot be done in place, so go
     through the work vector and then copy the result back into the solution array. */
  thrust::copy(thrust::make_permutation_iterator(solGPU, factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(solGPU+len, factors->cpermIndices->end()),
               work->begin());
  thrust::copy(work->begin(), work->end(), solGPU);

  /* Hand the arrays back, then wait for the device before stopping the timer */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1436bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve with the
  cuSPARSE triangular factors of A when no permutation has to be applied.

  Input Parameters:
+ A  - factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side

  Output Parameter:
. xx - solution

  Notes:
  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve()
  is called first and the factor pointers are re-read.
  The right-hand side is fed directly to the first solve; the work vector stored in
  the factor structure holds the intermediate result.
  Logs GPU time and 2*nnz - (number of columns of A) GPU flops.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  status;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors on first use */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the solution (write-only access) and right-hand side (read-only access) */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: solve with the transposed upper factor, right-hand side -> work vector */
  CsrMatrix *upperMat = upperT->csrMat;
  status = cusparse_solve(factors->handle, upperT->solveOp,
                          upperMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upperMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upperT->descr,
                          upperMat->values->data().get(),
                          upperMat->row_offsets->data().get(),
                          upperMat->column_indices->data().get(),
                          upperT->solveInfo,
                          rhs, work->data().get()
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upperT->solvePolicy, upperT->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Step 2: solve with the transposed lower factor, work vector -> solution array */
  CsrMatrix *lowerMat = lowerT->csrMat;
  status = cusparse_solve(factors->handle, lowerT->solveOp,
                          lowerMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lowerMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lowerT->descr,
                          lowerMat->values->data().get(),
                          lowerMat->row_offsets->data().get(),
                          lowerMat->column_indices->data().get(),
                          lowerT->solveInfo,
                          work->data().get(), sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lowerT->solvePolicy, lowerT->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Hand the arrays back, then wait for the device before stopping the timer */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1504bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - solve with the cuSPARSE triangular factors of A when
  row/column permutations are present.

  Input Parameters:
+ A  - factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side

  Output Parameter:
. xx - solution

  Notes:
  Sequence: gather bb through the row permutation into the work vector, solve with
  the lower factor, solve with the upper factor, gather the result through the
  column permutation into xx.
  Logs GPU time and 2*nnz - (number of columns of A) GPU flops.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  status;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Device arrays of the solution (write-only access) and right-hand side (read-only access) */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: gather the right-hand side through the row permutation into the work vector */
  thrust::copy(thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: solve with the lower factor, work vector -> solution array */
  CsrMatrix *lowerMat = lower->csrMat;
  status = cusparse_solve(factors->handle, lower->solveOp,
                          lowerMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lowerMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lower->descr,
                          lowerMat->values->data().get(),
                          lowerMat->row_offsets->data().get(),
                          lowerMat->column_indices->data().get(),
                          lower->solveInfo,
                          work->data().get(), sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lower->solvePolicy, lower->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Step 3: solve with the upper factor, solution array -> work vector */
  CsrMatrix *upperMat = upper->csrMat;
  status = cusparse_solve(factors->handle, upper->solveOp,
                          upperMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upperMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upper->descr,
                          upperMat->values->data().get(),
                          upperMat->row_offsets->data().get(),
                          upperMat->column_indices->data().get(),
                          upper->solveInfo,
                          sol, work->data().get()
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upper->solvePolicy, upper->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Step 4: gather the work vector through the column permutation into the solution array */
  thrust::copy(thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solGPU);

  /* Hand the arrays back, then wait for the device before stopping the timer */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
15799ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - solve with the cuSPARSE triangular
  factors of A when no permutation has to be applied.

  Input Parameters:
+ A  - factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side

  Output Parameter:
. xx - solution

  Notes:
  The right-hand side is fed directly to the lower solve; the work vector stored in
  the factor structure holds the intermediate result consumed by the upper solve.
  Logs GPU time and 2*nnz - (number of columns of A) GPU flops.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  status;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Device arrays of the solution (write-only access) and right-hand side (read-only access) */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: solve with the lower factor, right-hand side -> work vector */
  CsrMatrix *lowerMat = lower->csrMat;
  status = cusparse_solve(factors->handle, lower->solveOp,
                          lowerMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lowerMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lower->descr,
                          lowerMat->values->data().get(),
                          lowerMat->row_offsets->data().get(),
                          lowerMat->column_indices->data().get(),
                          lower->solveInfo,
                          rhs, work->data().get()
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lower->solvePolicy, lower->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Step 2: solve with the upper factor, work vector -> solution array */
  CsrMatrix *upperMat = upper->csrMat;
  status = cusparse_solve(factors->handle, upper->solveOp,
                          upperMat->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upperMat->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upper->descr,
                          upperMat->values->data().get(),
                          upperMat->row_offsets->data().get(),
                          upperMat->column_indices->data().get(),
                          upper->solveInfo,
                          work->data().get(), sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upper->solvePolicy, upper->solveBuffer
                        #endif
                          );CHKERRCUSPARSE(status);

  /* Hand the arrays back, then wait for the device before stopping the timer */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16399ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Brings the numerical values of A back to the host.

  Acts only when A->offloadmask is PETSC_OFFLOAD_GPU: the a->nz values held in the
  device CsrMatrix are copied into a->a and the mask becomes PETSC_OFFLOAD_BOTH.
  For any other mask value the routine returns without touching A.

  Only the values array is transferred; no index arrays are copied here.
  NOTE(review): cusp->mat->mat is cast to CsrMatrix unconditionally - presumably callers
  only reach this with the CSR storage format; confirm.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  size_t             nbytes;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* the host copy is refreshed only when the mask says the GPU holds the current values */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  csr    = (CsrMatrix*)cusparsestruct->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a,csr->values->data().get(),nbytes,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);

  /* host and device now agree */
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
16607e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array of A.

  The values are first synchronised from the device (if the device copy is the current one),
  then *array is pointed at the host storage. The offload mask is set to PETSC_OFFLOAD_CPU
  before returning, so the device copy is treated as out of date from here on.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host array holding the nonzero values
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
16727e8381f9SStefano Zampini 
16736fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
16749ae82921SPaul Mullowney {
1675aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
16767c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
16779ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1678213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
16799ae82921SPaul Mullowney   PetscErrorCode               ierr;
1680aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1681b06137fdSPaul Mullowney   cudaError_t                  err;
16829ae82921SPaul Mullowney 
16839ae82921SPaul Mullowney   PetscFunctionBegin;
168495639643SRichard Tran Mills   if (A->boundtocpu) PetscFunctionReturn(0);
1685c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
168681902715SJunchao Zhang     if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
168781902715SJunchao Zhang       /* Copy values only */
1688afb2bd1cSJunchao Zhang       CsrMatrix *matrix,*matrixT;
1689afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
169085ba7357SStefano Zampini 
169185ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1692afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
169305035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
16944863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
169585ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
169681902715SJunchao Zhang 
169781902715SJunchao Zhang       /* Update matT when it was built before */
169881902715SJunchao Zhang       if (cusparsestruct->matTranspose) {
169981902715SJunchao Zhang         cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
1700afb2bd1cSJunchao Zhang         matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
170185ba7357SStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
170281902715SJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1703afb2bd1cSJunchao Zhang                             A->cmap->n, matrix->num_entries,
1704afb2bd1cSJunchao Zhang                             matrix->values->data().get(),
170581902715SJunchao Zhang                             cusparsestruct->rowoffsets_gpu->data().get(),
1706afb2bd1cSJunchao Zhang                             matrix->column_indices->data().get(),
1707afb2bd1cSJunchao Zhang                             matrixT->values->data().get(),
1708afb2bd1cSJunchao Zhang                           #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1709afb2bd1cSJunchao Zhang                             matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1710afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC,indexBase,
1711afb2bd1cSJunchao Zhang                             cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
1712afb2bd1cSJunchao Zhang                           #else
1713afb2bd1cSJunchao Zhang                             matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1714afb2bd1cSJunchao Zhang                             CUSPARSE_ACTION_NUMERIC, indexBase
1715afb2bd1cSJunchao Zhang                           #endif
1716afb2bd1cSJunchao Zhang                            );CHKERRCUSPARSE(stat);
171705035670SJunchao Zhang         err  = WaitForCUDA();CHKERRCUDA(err);
171885ba7357SStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
171981902715SJunchao Zhang       }
172034d6c7a5SJose E. Roman     } else {
172185ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
17227c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
17237c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
17247c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
172581902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
17269ae82921SPaul Mullowney       try {
17279ae82921SPaul Mullowney         if (a->compressedrow.use) {
17289ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
17299ae82921SPaul Mullowney           ii   = a->compressedrow.i;
17309ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
17319ae82921SPaul Mullowney         } else {
1732213423ffSJunchao Zhang           m    = A->rmap->n;
1733213423ffSJunchao Zhang           ii   = a->i;
1734e6e9a74fSStefano Zampini           ridx = NULL;
17359ae82921SPaul Mullowney         }
1736213423ffSJunchao Zhang         cusparsestruct->nrows = m;
17379ae82921SPaul Mullowney 
173885ba7357SStefano Zampini         /* create cusparse matrix */
1739aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
174057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
174157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
174257d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
17439ae82921SPaul Mullowney 
1744afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
17457656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
17467656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1747afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17487656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
17497656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
175057d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1751b06137fdSPaul Mullowney 
1752aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1753aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1754aa372e3fSPaul Mullowney           /* set the matrix */
1755afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1756afb2bd1cSJunchao Zhang           mat->num_rows = m;
1757afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1758afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1759afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1760afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
17619ae82921SPaul Mullowney 
1762afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1763afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1764aa372e3fSPaul Mullowney 
1765afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1766afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1767aa372e3fSPaul Mullowney 
1768aa372e3fSPaul Mullowney           /* assign the pointer */
1769afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1770afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1771afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1772afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1773afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1774afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1775afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1776afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1777afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1778afb2bd1cSJunchao Zhang           }
1779afb2bd1cSJunchao Zhang          #endif
1780aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1781afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1782afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1783afb2bd1cSJunchao Zhang          #else
1784afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1785afb2bd1cSJunchao Zhang           mat->num_rows = m;
1786afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1787afb2bd1cSJunchao Zhang           mat->num_entries = a->nz;
1788afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1789afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1790aa372e3fSPaul Mullowney 
1791afb2bd1cSJunchao Zhang           mat->column_indices = new THRUSTINTARRAY32(a->nz);
1792afb2bd1cSJunchao Zhang           mat->column_indices->assign(a->j, a->j+a->nz);
1793aa372e3fSPaul Mullowney 
1794afb2bd1cSJunchao Zhang           mat->values = new THRUSTARRAY(a->nz);
1795afb2bd1cSJunchao Zhang           mat->values->assign(a->a, a->a+a->nz);
1796aa372e3fSPaul Mullowney 
1797aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
179857d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1799aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1800aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1801afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1802afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1803afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1804afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
180557d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1806aa372e3fSPaul Mullowney           /* assign the pointer */
1807aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1808aa372e3fSPaul Mullowney 
1809afb2bd1cSJunchao Zhang           if (mat) {
1810afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1811afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1812afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1813afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1814087f3262SPaul Mullowney           }
1815afb2bd1cSJunchao Zhang          #endif
1816087f3262SPaul Mullowney         }
1817ca45077fSPaul Mullowney 
1818aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1819213423ffSJunchao Zhang         if (a->compressedrow.use) {
1820213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1821aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1822aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1823213423ffSJunchao Zhang           tmp = m;
1824213423ffSJunchao Zhang         } else {
1825213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
1826213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
1827213423ffSJunchao Zhang           tmp = 0;
1828213423ffSJunchao Zhang         }
1829213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
1830aa372e3fSPaul Mullowney 
1831aa372e3fSPaul Mullowney         /* assign the pointer */
1832aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
18339ae82921SPaul Mullowney       } catch(char *ex) {
18349ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
18359ae82921SPaul Mullowney       }
183605035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
183785ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
183834d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
183934d6c7a5SJose E. Roman     }
1840c70f7ee4SJunchao Zhang     A->offloadmask = PETSC_OFFLOAD_BOTH;
18419ae82921SPaul Mullowney   }
18429ae82921SPaul Mullowney   PetscFunctionReturn(0);
18439ae82921SPaul Mullowney }
18449ae82921SPaul Mullowney 
1845c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
1846aa372e3fSPaul Mullowney {
1847aa372e3fSPaul Mullowney   template <typename Tuple>
1848aa372e3fSPaul Mullowney   __host__ __device__
1849aa372e3fSPaul Mullowney   void operator()(Tuple t)
1850aa372e3fSPaul Mullowney   {
1851aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
1852aa372e3fSPaul Mullowney   }
1853aa372e3fSPaul Mullowney };
1854aa372e3fSPaul Mullowney 
/* Functor for tuple-like elements: copies element 0 into element 1 (get<1> = get<0>) */
struct VecCUDAEquals
{
  template <typename ZipEntry>
  __host__ __device__
  void operator()(ZipEntry entry)
  {
    /* element 0 is the source, element 1 the destination */
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
18647e8381f9SStefano Zampini 
/* Functor for tuple-like elements: copies element 1 into element 0 (get<0> = get<1>),
   i.e. the opposite direction of VecCUDAEquals */
struct VecCUDAEqualsReverse
{
  template <typename ZipEntry>
  __host__ __device__
  void operator()(ZipEntry entry)
  {
    /* element 1 is the source, element 0 the destination */
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1874e6e9a74fSStefano Zampini 
1875afb2bd1cSJunchao Zhang struct MatMatCusparse {
1876ccdfe979SStefano Zampini   PetscBool            cisdense;
1877ccdfe979SStefano Zampini   PetscScalar          *Bt;
1878ccdfe979SStefano Zampini   Mat                  X;
1879afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1880afb2bd1cSJunchao Zhang   PetscBool            initialized;   /* C = alpha op(A) op(B) + beta C */
1881afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matBDescr;
1882afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t matCDescr;
1883afb2bd1cSJunchao Zhang   size_t               spmmBufferSize;
1884afb2bd1cSJunchao Zhang   void                 *spmmBuffer;
1885afb2bd1cSJunchao Zhang   PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
1886afb2bd1cSJunchao Zhang #endif
1887afb2bd1cSJunchao Zhang };
1888ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything owned by a MatMatCusparse object.

  Frees the device transpose buffer, the cuSPARSE dense descriptors and the SpMM workspace
  (CUDA >= 11 only), the intermediate matrix X, and finally the struct itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  /* no guard on Bt, unlike spmmBuffer below */
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spmmBuffer) {
    cerr = cudaFree(mm->spmmBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1907ccdfe979SStefano Zampini 
/* dense-dense product kernel, defined elsewhere; used below to finish PtAP and RARt */
PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of a product between a
  SEQAIJCUSPARSE matrix A and a dense matrix B, with the result in the dense matrix C.

  Supported product types: AB, AtB, ABt, PtAP and RARt. For PtAP and RARt the sparse product
  is written into the intermediate matrix mmdata->X and a dense-dense product with B then
  produces C.

  Requires C->product->data (a MatMatCusparse) to be set; errors if A is not of type
  MATSEQAIJCUSPARSE or if A is bound to the CPU.

  If B is not a MATSEQDENSECUDA matrix it is converted in place for the duration of the call and
  converted back to MATSEQDENSE before returning. If mmdata->cisdense is set, C is converted to
  MATSEQDENSE at the end.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  /* select the sparse operand (A or its explicit transpose), op(A), and the m x n size of the sparse product */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      /* no explicit transpose requested: let cuSPARSE apply the transpose operation */
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* for PtAP and RARt the sparse product goes into the intermediate X, otherwise straight into C */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    } else {
      /* descriptor is reused (only the other LDA changed): still point it at the current array */
      stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    } else {
      /* descriptor is reused (only the other LDA changed): still point it at the current array */
      stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                              csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                              csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                              csrmat->values->data().get(),
                              CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                              CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    } else {
      /* same reasoning as for the dense descriptors: keep the values pointer current */
      stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr; /* distinct name so the outer cudaError_t cerr is not shadowed */

    /* form B^T explicitly in mmdata->Bt with a geam call */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    berr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(berr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish the triple products with a dense-dense multiply of B and the intermediate X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* hand matrices back in the types the caller gave us */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2079ccdfe979SStefano Zampini 
2080ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2081ccdfe979SStefano Zampini {
2082ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2083ccdfe979SStefano Zampini   Mat                A,B;
2084ccdfe979SStefano Zampini   PetscInt           m,n;
2085ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2086ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2087ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2088ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2089ccdfe979SStefano Zampini 
2090ccdfe979SStefano Zampini   PetscFunctionBegin;
2091ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2092ccdfe979SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2093ccdfe979SStefano Zampini   A    = product->A;
2094ccdfe979SStefano Zampini   B    = product->B;
2095ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2096ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2097ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2098ccdfe979SStefano Zampini   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2099ccdfe979SStefano Zampini   switch (product->type) {
2100ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2101ccdfe979SStefano Zampini     m = A->rmap->n;
2102ccdfe979SStefano Zampini     n = B->cmap->n;
2103ccdfe979SStefano Zampini     break;
2104ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2105ccdfe979SStefano Zampini     m = A->cmap->n;
2106ccdfe979SStefano Zampini     n = B->cmap->n;
2107ccdfe979SStefano Zampini     break;
2108ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2109ccdfe979SStefano Zampini     m = A->rmap->n;
2110ccdfe979SStefano Zampini     n = B->rmap->n;
2111ccdfe979SStefano Zampini     break;
2112ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2113ccdfe979SStefano Zampini     m = B->cmap->n;
2114ccdfe979SStefano Zampini     n = B->cmap->n;
2115ccdfe979SStefano Zampini     break;
2116ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2117ccdfe979SStefano Zampini     m = B->rmap->n;
2118ccdfe979SStefano Zampini     n = B->rmap->n;
2119ccdfe979SStefano Zampini     break;
2120ccdfe979SStefano Zampini   default:
2121ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2122ccdfe979SStefano Zampini   }
2123ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2124ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2125ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2126ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2127ccdfe979SStefano Zampini 
2128ccdfe979SStefano Zampini   /* product data */
2129ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2130ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2131afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2132afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2133ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2134afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2135ccdfe979SStefano Zampini   }
2136afb2bd1cSJunchao Zhang  #endif
2137ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2138ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2139ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2140ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2141ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2142ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2143ccdfe979SStefano Zampini     } else {
2144ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2145ccdfe979SStefano Zampini     }
2146ccdfe979SStefano Zampini   }
2147ccdfe979SStefano Zampini   C->product->data    = mmdata;
2148ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2149ccdfe979SStefano Zampini 
2150ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2151ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2152ccdfe979SStefano Zampini }
2153ccdfe979SStefano Zampini 
2154ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2155ccdfe979SStefano Zampini 
2156ccdfe979SStefano Zampini /* handles dense B */
2157ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C)
2158ccdfe979SStefano Zampini {
2159ccdfe979SStefano Zampini   Mat_Product    *product = C->product;
2160ccdfe979SStefano Zampini   PetscErrorCode ierr;
2161ccdfe979SStefano Zampini 
2162ccdfe979SStefano Zampini   PetscFunctionBegin;
2163ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2164ccdfe979SStefano Zampini   if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A");
2165ccdfe979SStefano Zampini   if (product->A->boundtocpu) {
2166ccdfe979SStefano Zampini     ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr);
2167ccdfe979SStefano Zampini     PetscFunctionReturn(0);
2168ccdfe979SStefano Zampini   }
2169ccdfe979SStefano Zampini   switch (product->type) {
2170ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2171ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2172ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2173ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2174ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2175ccdfe979SStefano Zampini     C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
2176ccdfe979SStefano Zampini   default:
2177ccdfe979SStefano Zampini     break;
2178ccdfe979SStefano Zampini   }
2179ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2180ccdfe979SStefano Zampini }
2181ccdfe979SStefano Zampini 
21826fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
21839ae82921SPaul Mullowney {
2184b175d8bbSPaul Mullowney   PetscErrorCode ierr;
21859ae82921SPaul Mullowney 
21869ae82921SPaul Mullowney   PetscFunctionBegin;
2187e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2188e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2189e6e9a74fSStefano Zampini }
2190e6e9a74fSStefano Zampini 
2191e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
2192e6e9a74fSStefano Zampini {
2193e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2194e6e9a74fSStefano Zampini 
2195e6e9a74fSStefano Zampini   PetscFunctionBegin;
2196e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2197e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2198e6e9a74fSStefano Zampini }
2199e6e9a74fSStefano Zampini 
2200e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2201e6e9a74fSStefano Zampini {
2202e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2203e6e9a74fSStefano Zampini 
2204e6e9a74fSStefano Zampini   PetscFunctionBegin;
2205e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
2206e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2207e6e9a74fSStefano Zampini }
2208e6e9a74fSStefano Zampini 
2209e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2210e6e9a74fSStefano Zampini {
2211e6e9a74fSStefano Zampini   PetscErrorCode ierr;
2212e6e9a74fSStefano Zampini 
2213e6e9a74fSStefano Zampini   PetscFunctionBegin;
2214e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr);
22159ae82921SPaul Mullowney   PetscFunctionReturn(0);
22169ae82921SPaul Mullowney }
22179ae82921SPaul Mullowney 
22186fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2219ca45077fSPaul Mullowney {
2220b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2221ca45077fSPaul Mullowney 
2222ca45077fSPaul Mullowney   PetscFunctionBegin;
2223e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2224ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2225ca45077fSPaul Mullowney }
2226ca45077fSPaul Mullowney 
2227afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2228e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
22299ae82921SPaul Mullowney {
22309ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2231aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
22329ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2233e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2234b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
223557d48284SJunchao Zhang   cudaError_t                  cerr;
2236aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2237e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2238e6e9a74fSStefano Zampini   PetscBool                    compressed;
2239afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2240afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2241afb2bd1cSJunchao Zhang #endif
22426e111a19SKarl Rupp 
22439ae82921SPaul Mullowney   PetscFunctionBegin;
2244e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2245e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2246afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2247d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2248e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2249e6e9a74fSStefano Zampini   }
225034d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
225134d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2252e6e9a74fSStefano Zampini   if (!trans) {
22539ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2254c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2255e6e9a74fSStefano Zampini   } else {
2256e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2257e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2258e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2259e6e9a74fSStefano Zampini     } else {
2260afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2261e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2262e6e9a74fSStefano Zampini     }
2263e6e9a74fSStefano Zampini   }
2264e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2265e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2266213423ffSJunchao Zhang 
2267e6e9a74fSStefano Zampini   try {
2268e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2269213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2270213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2271afb2bd1cSJunchao Zhang 
227285ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2273e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2274afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2275afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2276afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2277afb2bd1cSJunchao Zhang       */
2278e6e9a74fSStefano Zampini       xptr = xarray;
2279afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2280213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2281afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2282afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2283afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2284afb2bd1cSJunchao Zhang        */
2285afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2286afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2287afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2288afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2289afb2bd1cSJunchao Zhang       }
2290afb2bd1cSJunchao Zhang      #endif
2291e6e9a74fSStefano Zampini     } else {
2292afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2293afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2294afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2295afb2bd1cSJunchao Zhang        */
2296afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2297e6e9a74fSStefano Zampini       dptr = zarray;
2298e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2299afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2300e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2301e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2302e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2303e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2304e6e9a74fSStefano Zampini       }
2305afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2306afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2307afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2308afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2309afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2310afb2bd1cSJunchao Zhang       }
2311afb2bd1cSJunchao Zhang      #endif
2312e6e9a74fSStefano Zampini     }
23139ae82921SPaul Mullowney 
2314afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2315aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2316afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2317afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2318afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2319afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2320afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2321afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2322afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2323afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2324afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2325afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2326afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2327afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2328afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2329afb2bd1cSJunchao Zhang 
2330afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2331afb2bd1cSJunchao Zhang       } else {
2332afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2333afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2334afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2335afb2bd1cSJunchao Zhang       }
2336afb2bd1cSJunchao Zhang 
2337afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2338afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2339afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2340afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2341afb2bd1cSJunchao Zhang                                beta,
2342afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2343afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2344afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2345afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2346afb2bd1cSJunchao Zhang      #else
23477656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2348e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2349a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2350afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2351aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2352e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
235357d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2354afb2bd1cSJunchao Zhang      #endif
2355aa372e3fSPaul Mullowney     } else {
2356213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2357afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2358afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2359afb2bd1cSJunchao Zhang        #else
2360301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2361e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2362afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2363e6e9a74fSStefano Zampini                                  xptr, beta,
236457d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2365afb2bd1cSJunchao Zhang        #endif
2366a65300a6SPaul Mullowney       }
2367aa372e3fSPaul Mullowney     }
236805035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2369958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2370aa372e3fSPaul Mullowney 
2371e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2372213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2373213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2374213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2375e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2376213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
23777656d835SStefano Zampini         }
2378213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2379c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
23807656d835SStefano Zampini       }
23817656d835SStefano Zampini 
2382213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2383213423ffSJunchao Zhang       if (compressed) {
2384213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2385e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2386c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2387e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2388c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
238905035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2390958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2391e6e9a74fSStefano Zampini       }
2392e6e9a74fSStefano Zampini     } else {
2393e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2394e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2395e6e9a74fSStefano Zampini       }
2396e6e9a74fSStefano Zampini     }
2397e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2398213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2399213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
24009ae82921SPaul Mullowney   } catch(char *ex) {
24019ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
24029ae82921SPaul Mullowney   }
2403e6e9a74fSStefano Zampini   if (yy) {
2404958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2405e6e9a74fSStefano Zampini   } else {
2406e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2407e6e9a74fSStefano Zampini   }
24089ae82921SPaul Mullowney   PetscFunctionReturn(0);
24099ae82921SPaul Mullowney }
24109ae82921SPaul Mullowney 
24116fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2412ca45077fSPaul Mullowney {
2413b175d8bbSPaul Mullowney   PetscErrorCode ierr;
24146e111a19SKarl Rupp 
2415ca45077fSPaul Mullowney   PetscFunctionBegin;
2416e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2417ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2418ca45077fSPaul Mullowney }
2419ca45077fSPaul Mullowney 
24206fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
24219ae82921SPaul Mullowney {
24229ae82921SPaul Mullowney   PetscErrorCode              ierr;
24233fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL, h_mat;
24243fa6b06aSMark Adams   PetscBool                   is_seq = PETSC_TRUE;
24253fa6b06aSMark Adams   PetscInt                    nnz_state = A->nonzerostate;
24269ae82921SPaul Mullowney   PetscFunctionBegin;
2427bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
24283fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2429bc3f50f2SPaul Mullowney   }
24303fa6b06aSMark Adams   if (d_mat) {
24313fa6b06aSMark Adams     cudaError_t err;
24323fa6b06aSMark Adams     ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr);
24333fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
24343fa6b06aSMark Adams     nnz_state = h_mat.nonzerostate;
24353fa6b06aSMark Adams     is_seq = h_mat.seq;
24363fa6b06aSMark Adams   }
24373fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
24383fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
24393fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU
24403fa6b06aSMark Adams     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
24413fa6b06aSMark Adams   } else if (nnz_state > A->nonzerostate) {
24423fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
24433fa6b06aSMark Adams   }
24443fa6b06aSMark Adams 
24459ae82921SPaul Mullowney   PetscFunctionReturn(0);
24469ae82921SPaul Mullowney }
24479ae82921SPaul Mullowney 
24489ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2449e057df02SPaul Mullowney /*@
24509ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
2451e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
2452e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
2453e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
2454e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
2455e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
24569ae82921SPaul Mullowney 
2457d083f849SBarry Smith    Collective
24589ae82921SPaul Mullowney 
24599ae82921SPaul Mullowney    Input Parameters:
24609ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
24619ae82921SPaul Mullowney .  m - number of rows
24629ae82921SPaul Mullowney .  n - number of columns
24639ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
24649ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
24650298fd71SBarry Smith          (possibly different for each row) or NULL
24669ae82921SPaul Mullowney 
24679ae82921SPaul Mullowney    Output Parameter:
24689ae82921SPaul Mullowney .  A - the matrix
24699ae82921SPaul Mullowney 
24709ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
24719ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
24729ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
24739ae82921SPaul Mullowney 
24749ae82921SPaul Mullowney    Notes:
24759ae82921SPaul Mullowney    If nnz is given then nz is ignored
24769ae82921SPaul Mullowney 
24779ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
24789ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
24799ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
24809ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
24819ae82921SPaul Mullowney 
24829ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
24830298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
24849ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
24859ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
24869ae82921SPaul Mullowney 
24879ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
24889ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
24899ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
24909ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
24919ae82921SPaul Mullowney 
24929ae82921SPaul Mullowney    Level: intermediate
24939ae82921SPaul Mullowney 
2494e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
24959ae82921SPaul Mullowney @*/
24969ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
24979ae82921SPaul Mullowney {
24989ae82921SPaul Mullowney   PetscErrorCode ierr;
24999ae82921SPaul Mullowney 
25009ae82921SPaul Mullowney   PetscFunctionBegin;
25019ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
25029ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
25039ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
25049ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
25059ae82921SPaul Mullowney   PetscFunctionReturn(0);
25069ae82921SPaul Mullowney }
25079ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - Releases the GPU-side data hanging off A->spptr, frees the
   optional device matrix descriptor, removes the functions composed on the object and
   finally calls MatDestroy_SeqAIJ() for the host data.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode              ierr;
  PetscSplitCSRDataStructure  *d_mat = NULL;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* Detach the device matrix before the struct is destroyed; it is released further below.
       MatSeqAIJCUSPARSE_Destroy() accepts a NULL struct, so a NULL spptr must not be dereferenced here either */
    if (cusp) {
      d_mat           = cusp->deviceMat;
      cusp->deviceMat = NULL;
    }
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* d_mat points to device memory: copy the descriptor to the host so its fields can be inspected */
    err = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      /* diag.i is freed only when compressed rows are in use */
      if (a->compressedrow.use) {
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  /* remove the type-specific functions composed on the object */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
25439ae82921SPaul Mullowney 
2544ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
254595639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: duplicate with the plain SeqAIJ routine */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* step 2: convert the duplicate in place to the CUSPARSE type */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
25559ff858a8SKarl Rupp 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Switches the operation table and the composed functions of a
   non-factored matrix between the CUSPARSE implementations (flg false) and the plain SeqAIJ
   ones (flg true). Factored matrices are left untouched.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (!flg) {
    /* GPU path: install the CUSPARSE kernels and compose the GPU-specific functions */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* CPU path: bring the data back from the GPU before switching to the host kernels */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    /* no Hermitian-transpose products are installed on the CPU path */
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
  }
  /* record the binding; the inode flag follows it (set only when bound to the CPU) */
  A->boundtocpu  = flg;
  aij->inode.use = flg;
  PetscFunctionReturn(0);
}
259495639643SRichard Tran Mills 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeroes the matrix values on the device (CSR values of the
   matrix and of its cached transpose, plus the optional device matrix) and on the host.

   The device CSR values are only touched when the storage format is MAT_CUSPARSE_CSR: for the
   ELL/HYB formats the pointer stored in the mult struct is a cusparseHybMat_t, not a CsrMatrix,
   and must not be accessed through a CsrMatrix pointer.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode             ierr;
  PetscBool                  both = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
        if (matrix && matrix->values) {
          /* host and device copies will both be zero once the host side is cleared below */
          both = PETSC_TRUE;
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
        if (matrix && matrix->values) {
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
    }
    /* for non-CSR formats `both` stays false, so the offload mask left by MatZeroEntries_SeqAIJ() is kept
       (NOTE(review): presumably that marks the host copy as the valid one - confirm) */
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ   *a = (Mat_SeqAIJ*)A->data;
    PetscInt     n = A->rmap->n, nnz = a->i[n];
    cudaError_t  err;
    PetscScalar  *vals;

    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* d_mat is a device pointer: fetch the address stored in its diag.a field, then clear nnz scalars there */
    err = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err = cudaMemset(vals,0,(nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}
26333fa6b06aSMark Adams 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a SeqAIJCUSPARSE one.

   For MAT_INITIAL_MATRIX the source is duplicated into *newmat, for MAT_REUSE_MATRIX it is
   copied into the existing *newmat; any other reuse value operates on *newmat as passed in.
   The mtype argument is not read by this routine.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default:
    break;
  }
  B = *newmat;

  /* replace the stored default vector type with VECCUDA */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  /* attach a fresh GPU-side struct unless one exists already or an existing matrix is being reused */
  if (!B->spptr && reuse != MAT_REUSE_MATRIX) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr = PetscNew(&factors);CHKERRQ(ierr);
      stat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(stat);
      B->spptr = factors;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      cusp->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      B->spptr = cusp;
      cusp->deviceMat = NULL;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* install the type-specific operations */
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* binding with PETSC_FALSE installs the GPU products and composed functions */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
26819ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Constructor for the SeqAIJCUSPARSE type: builds a SeqAIJ matrix,
   converts it in place and then processes the type-specific options.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* CUDA initialization check comes first, before anything is built */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);

  /* build the host matrix, then convert it in place */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);

  /* the options Begin/End pair brackets the call and supplies PetscOptionsObject */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
269502fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later
   and are not supported with CUDA 11.0 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
27127f756511SDominic Meiser 
271342c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
271442c9c57cSBarry Smith 
27150f39cd5aSBarry Smith 
/*
   MatSolverTypeRegister_CUSPARSE - Registers MatGetFactor_seqaijcusparse_cusparse() as the
   MATSOLVERCUSPARSE factory for MATSEQAIJCUSPARSE matrices, once per supported factor type.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* registration order: LU, Cholesky, ILU, ICC */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const PetscInt      nftypes  = (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscErrorCode      ierr;
  PetscInt            k;

  PetscFunctionBegin;
  for (k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
272729b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Frees everything owned by a Mat_SeqAIJCUSPARSE struct and the
   struct itself. A NULL struct is accepted and ignored. The deviceMat member is not released here.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  /* the matrix and its transpose are destroyed with the same storage format */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* work arrays and COO helper arrays */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->cooPerm_v;
  delete cusp->cooPerm_w;

  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  {
    cudaError_t cerr = cudaFree(cusp->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* free through the caller's pointer, as the original code does */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27517f756511SDominic Meiser 
/*
   CsrMatrix_Destroy - Deletes the three arrays of a CsrMatrix and the CsrMatrix itself, then
   sets the caller's pointer to NULL. Does nothing when *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
27647f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - Releases the cusparse
   descriptor and analysis info, the CSR storage, the device/host buffers and finally the
   struct itself. A NULL struct is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  /* AA_h is released with cudaFreeHost(), unlike the device buffers around it */
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* free through the caller's pointer, as the original code does */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27847f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - Releases the matrix storage held
   by a Mat_SeqAIJCUSPARSEMultStruct, its descriptor, the compressed-row indices, the device
   scalars and (CUDA >= 11) the generic-API descriptors and SpMV buffers; then deletes the struct
   and sets the caller's pointer to NULL. A NULL struct is accepted and ignored.

   format selects how the stored matrix pointer is interpreted: a cusparseHybMat_t for
   MAT_CUSPARSE_ELL/MAT_CUSPARSE_HYB (an error with CUDA >= 11), a CsrMatrix otherwise.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate a failure instead of dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* the scalar constants are released with cudaFree() */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* three SpMV slots are visited; only the initialized ones own a buffer and vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
28287f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSETriFactors_Reset - Destroys the four triangular-factor structs and the
   permutation/work arrays of a Mat_SeqAIJCUSPARSETriFactors, leaving the container itself
   allocated with those array pointers set to NULL. A NULL container is ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!fs) PetscFunctionReturn(0);

  /* lower/upper factors and their transposes */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);

  /* delete each array and clear its pointer so that a later reset or destroy is harmless */
  delete fs->rpermIndices;
  fs->rpermIndices = NULL;
  delete fs->cpermIndices;
  fs->cpermIndices = NULL;
  delete fs->workVector;
  fs->workVector = NULL;
  PetscFunctionReturn(0);
}
2848ccdfe979SStefano Zampini 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - Resets the triangular-factor container, destroys its
   cusparse handle (when set) and frees the container. A NULL container is ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
    /* read the handle first, then test it: avoids an assignment inside the condition */
    handle = (*trifactors)->handle;
    if (handle) {
      stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
    }
    ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
28657e8381f9SStefano Zampini 
/* Strict lexicographic "less than" on (first, second) index pairs; usable on host and device */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    /* the first component decides unless it ties, then the second one does */
    if (thrust::get<0>(t1) != thrust::get<0>(t2)) return thrust::get<0>(t1) < thrust::get<0>(t2);
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
28767e8381f9SStefano Zampini 
/* Equality of (first, second) index pairs: true only when both components match; usable on host and device */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
28867e8381f9SStefano Zampini 
/* Binary functor returning 0 when its two arguments are equal and 1 otherwise; usable on host and device */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) const
  {
    return t1 == t2 ? 0 : 1;
  }
};
28957e8381f9SStefano Zampini 
/* Binary functor returning the logical OR of its arguments as 0 or 1 (despite the name it does
   not add them); usable on host and device */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) const
  {
    return t1||t2;
  }
};
29047e8381f9SStefano Zampini 
29057e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - Moves the COO values v[] (or zeros when v is NULL) into the
   device CSR values using the permutation stored in cooPerm, either adding to or overwriting
   the existing values depending on imode, then assembles the matrix and copies it back to the host.

   When no cooPerm is stored, the matrix is only assembled.
*/
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  PetscInt           ncoo;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* nothing to scatter: just assemble */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  ncoo = cusp->cooPerm->size();
  csr  = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!cusp->cooPerm_v) cusp->cooPerm_v = new THRUSTARRAY(ncoo);
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);

  /* stage the input values on the device (zeros when no values are given) */
  if (!v) {
    thrust::fill(thrust::device,cusp->cooPerm_v->begin(),cusp->cooPerm_v->end(),0.);
  } else {
    cusp->cooPerm_v->assign(v,v+ncoo);
    ierr = PetscLogCpuToGpu(ncoo*sizeof(PetscScalar));CHKERRQ(ierr);
  }

  {
    /* the staged values viewed through the cooPerm permutation; shared by every branch below */
    auto srcBegin = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->begin());
    auto srcEnd   = thrust::make_permutation_iterator(cusp->cooPerm_v->begin(),cusp->cooPerm->end());

    if (imode == ADD_VALUES && cusp->cooPerm_a) {
      /* entries sharing a key in cooPerm_a are summed into cooPerm_w, which is then added to the CSR values */
      if (!cusp->cooPerm_w) cusp->cooPerm_w = new THRUSTARRAY(csr->values->size());
      thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),srcBegin,thrust::make_discard_iterator(),cusp->cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      thrust::transform(cusp->cooPerm_w->begin(),cusp->cooPerm_w->end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
    } else if (cusp->cooPerm_a) {
      /* non unique values insertion, result is undefined (we cannot guarantee last takes precedence)
         if we are inserting two different values into the same location */
      auto first = thrust::make_zip_iterator(thrust::make_tuple(srcBegin,
                                                                thrust::make_permutation_iterator(csr->values->begin(),cusp->cooPerm_a->begin())));
      auto last  = thrust::make_zip_iterator(thrust::make_tuple(srcEnd,
                                                                thrust::make_permutation_iterator(csr->values->begin(),cusp->cooPerm_a->end())));
      thrust::for_each(first,last,VecCUDAEquals());
    } else {
      /* no cooPerm_a: permuted inputs are paired one-to-one with the CSR values */
      auto first = thrust::make_zip_iterator(thrust::make_tuple(srcBegin,csr->values->begin()));
      auto last  = thrust::make_zip_iterator(thrust::make_tuple(srcEnd,csr->values->end()));
      if (imode == ADD_VALUES) {
        thrust::for_each(first,last,VecCUDAPlusEquals());
      } else {
        thrust::for_each(first,last,VecCUDAEquals());
      }
    }
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* the values were written on the device */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
29707e8381f9SStefano Zampini 
29717e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - Builds the CSR nonzero structure of A from n (row,column)
  pairs given in coordinate (COO) format and allocates the matching CUSPARSE data.

  Input Parameters:
+ A     - the matrix
. n     - number of (coo_i[k],coo_j[k]) pairs; repeated pairs are handled
. coo_i - row indices (length n), copied to a device vector
- coo_j - column indices (length n), copied to a device vector

  Notes:
  The pairs are sorted and made unique on the device with thrust.
  cusp->cooPerm holds the permutation produced by the sort (sorted position -> input position).
  cusp->cooPerm_a is NULL when all pairs are unique; otherwise it holds, for every sorted
  position, the index of the unique nonzero that position maps to.

  On return a->i, a->j, a->ilen and a->imax describe the new structure, the values are zero on
  both the host and the device, the transpose struct has been destroyed, and A is flagged as
  not assembled with offloadmask PETSC_OFFLOAD_CPU.

  The index ranges of coo_i/coo_j are not validated here.
*/
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *matrix;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0; /* nzr counts the rows that hold at least one nonzero */
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);

  /* The COO work arrays can only be reused if the number of input pairs did not change */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_v;
    delete cusp->cooPerm_w;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_v = NULL;
    cusp->cooPerm_w = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);          /* device copy of the row indices */
    THRUSTINTARRAY d_j(n);          /* device copy of the column indices */
    THRUSTINTARRAY ii(A->rmap->n);  /* row offsets 1..rmap->n, computed on the device */

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    /* Sort the (i,j) pairs and carry the identity permutation along with them.
       NOTE(review): IJCompare is defined elsewhere; it is expected to order by row first, since the
       row-offset search below needs d_i to be sorted - confirm */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());

    /* Keep copies of the sorted indices (repeats included) before unique() compacts d_i/d_j in place */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* NOTE(review): IJDiff and IJSum are defined elsewhere; presumably they flag a change between
         consecutive entries and combine the row/column flags, so that the scan below numbers the
         unique nonzeros - confirm against their definitions */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      /* adjacent_difference copies the first element unchanged: reset it so numbering starts at 0 */
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }

    /* For every row r in [0,rmap->n), ii[r] = number of unique entries with row index <= r,
       i.e. the CSR row offset of row r+1 */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());

    /* Replace the host CSR arrays; they are allocated separately from now on */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    /* Zero-initialize the host values: they are copied to the device below and the host copy is
       flagged as the valid one on return, so it must not contain indeterminate data */
    ierr = PetscCalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* after unique(), the first a->nz entries of d_j are the column indices of the unique nonzeros */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
    }
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now: mark the host data as the valid copy
     so that the copy to the GPU (re)builds the device structure from the new pattern */
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  {
    if (!cusp->mat || !cusp->mat->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
    thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
    /* the nonzero pattern changed: drop the transpose struct */
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
  }

  /* The structure is set but no values have been inserted yet */
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3076