xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision e8d2b73a75a25c9e078f3db3f3d2727a3946c772)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17*e8d2b73aSMark Adams 
/*
  Name tables handed to PetscOptionsEnum(). Each table lists the selectable value names first,
  then the enum type name, the common prefix of the enumerators, and a terminating 0.
*/
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /*
    The enums below are copied from cusparse.h in CUDA-11.0. The entries of MatCUSPARSESpMVAlgorithms[] etc.
    are listed in 0-based integer value order, because PetscOptionsEnum() is used to parse the user's
    command line options for them and it maps a name to its position in the table.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT",
    "COOMV_ALG",
    "CSRMV_ALG1",
    "CSRMV_ALG2",
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    0
  };
  /* note the position of COO_ALG4 (value 5) after CSR_ALG1 (value 4): table order follows the integer values */
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT",
    "COO_ALG1",
    "COO_ALG2",
    "COO_ALG3",
    "CSR_ALG1",
    "COO_ALG4",
    "CSR_ALG2",
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    0
  };
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID", /* cusparse does not have enum 0! We created one */
    "ALG1",
    "ALG2",
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    0
  };
#endif
549ae82921SPaul Mullowney 
/* ---- factorization: symbolic and numeric phases ---- */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* ---- triangular solves ---- */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);

/* ---- options and basic matrix operations ---- */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);

/* ---- matrix-vector products ---- */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* ---- destruction of the GPU-side structures ---- */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
/* MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded: one form for a triangular factor, one for a mult struct */
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* ---- host/device data movement and cached transpose ---- */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* ---- COO assembly interface ---- */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

/* ---- sub-array extraction ---- */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92c215019aSStefano Zampini 
/*
  MatCUSPARSESetStream - Records a CUDA stream in the matrix's CUSPARSE data and attaches it
  to the matrix's cuSPARSE handle.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
- stream - the CUDA stream to use

  Errors with PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cusperr;

  PetscFunctionBegin;
  if (!spptr) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  }
  /* remember the stream, then bind it to the handle */
  spptr->stream = stream;
  cusperr = cusparseSetStream(spptr->handle,spptr->stream);
  CHKERRCUSPARSE(cusperr);
  PetscFunctionReturn(0);
}
104b06137fdSPaul Mullowney 
/*
  MatCUSPARSESetHandle - Installs a cuSPARSE handle in the matrix's CUSPARSE data.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
- handle - the cuSPARSE handle to install

  Notes:
  A previously stored, different, non-NULL handle is destroyed with cusparseDestroy() before
  being replaced. The resulting handle is always switched to CUSPARSE_POINTER_MODE_DEVICE.
  Errors with PETSC_ERR_COR when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cusperr;

  PetscFunctionBegin;
  if (!spptr) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  }
  /* only an existing handle that differs from the new one has to be released */
  if (spptr->handle && spptr->handle != handle) {
    cusperr = cusparseDestroy(spptr->handle);
    CHKERRCUSPARSE(cusperr);
  }
  /* storing an identical handle again is a no-op */
  spptr->handle = handle;
  cusperr = cusparseSetPointerMode(spptr->handle, CUSPARSE_POINTER_MODE_DEVICE);
  CHKERRCUSPARSE(cusperr);
  PetscFunctionReturn(0);
}
121b06137fdSPaul Mullowney 
/*
  MatCUSPARSEClearHandle - Forgets the cuSPARSE handle stored in a MATSEQAIJCUSPARSE matrix.

  Input Parameter:
. A - the matrix

  Notes:
  The stored handle is only set to 0 here; it is not destroyed by this routine.
  Nothing happens when A is not of type MATSEQAIJCUSPARSE or has no spptr.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          isseqcusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isseqcusparse);CHKERRQ(ierr);
  if (isseqcusparse && spptr) {
    spptr->handle = 0;
  }
  PetscFunctionReturn(0);
}
134b06137fdSPaul Mullowney 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

  Input Parameter:
. A - the factored matrix (not inspected)

  Output Parameter:
. type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  type[0] = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1419ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1549ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the factor matrix used by the "cusparse"
  solver type.

  Input Parameters:
+ A     - the matrix to be factored; it supplies the communicator, the local row count n and
          (for LU-type factorizations) the block sizes
- ftype - requested factorization: MAT_FACTOR_LU, MAT_FACTOR_ILU, MAT_FACTOR_ILUDT,
          MAT_FACTOR_CHOLESKY or MAT_FACTOR_ICC

  Output Parameter:
. B - new n x n matrix of type MATSEQAIJCUSPARSE with the symbolic factorization callbacks and
      the preferred orderings installed; its preallocation is skipped

  Notes:
  Any other ftype yields PETSC_ERR_SUP. The check is done before anything is created so that the
  error path does not leave a half-configured, never-destroyed matrix behind in *B.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode  ierr;
  PetscInt        n = A->rmap->n;
  const PetscBool isLU   = (PetscBool)(ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT);
  const PetscBool isChol = (PetscBool)(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC);

  PetscFunctionBegin;
  /* reject unsupported factor types up front, before any object is allocated */
  if (!isLU && !isChol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* NOTE(review): factortype is assigned before MatSetType(); presumably the type constructor
     inspects it, so this ordering is kept as is */
  (*B)->factortype = ftype;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (isLU) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    /* nested dissection for the full factorization, natural ordering for the incomplete ones */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
  } else { /* isChol */
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  (*B)->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1859ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Stores the requested GPU storage format in the
  Mat_SeqAIJCUSPARSE data of A.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
. op     - MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL; both set the same single format field
- format - the storage format to record

  Any other op yields PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op != MAT_CUSPARSE_MULT && op != MAT_CUSPARSE_ALL) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  spptr->format = format;
  PetscFunctionReturn(0);
}
2039ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Notes:
   The request is forwarded to the "MatCUSPARSESetFormat_C" method composed with A, via PetscTryMethod().

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* type-specific implementation is looked up by name on the object */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));
  CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
230e057df02SPaul Mullowney 
/*
  MatSetOption_SeqAIJCUSPARSE - Option handler for SEQAIJCUSPARSE matrices.

  Input Parameters:
+ A   - the matrix
. op  - the option being set
- flg - the new value of the option

  Notes:
  MAT_FORM_EXPLICIT_TRANSPOSE is handled here; every other option is passed on to
  MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option is being switched off, drop any existing transpose so that turning it
       back on later cannot cause logic errors */
    if (A->form_explicit_transpose && !flg) {
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    }
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
248e6e9a74fSStefano Zampini 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
+ B    - the factor matrix (already symbolically factored)
. A    - the matrix to factor
- info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

  Notes:
  The values of A are first brought back from the GPU and the factorization itself is done by
  MatLUFactorNumeric_SeqAIJ(). Afterwards the solve callbacks of B are chosen according to
  whether both orderings are the identity, and MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() is
  called on B.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row,colperm = fact->col;
  PetscBool      rowIsId,colIsId;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* determine which version of MatSolve needs to be used. */
  ierr = ISIdentity(rowperm,&rowIsId);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colIsId);CHKERRQ(ierr);
  if (rowIsId && colIsId) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  }
  /* the multi-right-hand-side solves are cleared in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
281bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the SeqAIJCUSPARSE command line options.

  Input Parameters:
+ PetscOptionsObject - the options context (NOTE(review): the PetscOptions* macros appear to
                       refer to this parameter by name, so it must keep this name - confirm)
- A                  - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE

  Notes:
  Options are only queried for non-factored matrices (A->factortype == MAT_FACTOR_NONE):
  -mat_cusparse_mult_storage_format, -mat_cusparse_storage_format and, with CUDA >= 11.0,
  -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg, -mat_cusparse_csr2csc_alg.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                given;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&given);CHKERRQ(ierr);
    if (given) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for all operations; the current value is re-read since the option above may have changed it */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->format,(PetscEnum*)&fmt,&given);CHKERRQ(ierr);
    if (given) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* The three algorithm selections are written straight into the struct. After each one that the user
       actually supplied, one cuSPARSE enumerator is compared against the value the name table assumes,
       because PetscOptionsEnum() sets enum values based on their position in the table. */
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusp->spmvAlg,(PetscEnum*)&cusp->spmvAlg,&given);CHKERRQ(ierr);
    if (given && CUSPARSE_CSRMV_ALG1 != 2) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
    }

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusp->spmmAlg,(PetscEnum*)&cusp->spmmAlg,&given);CHKERRQ(ierr);
    if (given && CUSPARSE_SPMM_CSR_ALG1 != 4) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
    }

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusp->csr2cscAlg,(PetscEnum*)&cusp->csr2cscAlg,&given);CHKERRQ(ierr);
    if (given && CUSPARSE_CSR2CSC_ALG1 != 1) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
    }
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3179ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for SEQAIJCUSPARSE matrices.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatILUFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric LU routine on B.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset routine receives the address of the local pointer */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);
  CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3299ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for SEQAIJCUSPARSE matrices.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatLUFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric LU routine on B.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset routine receives the address of the local pointer */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);
  CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);
  CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3419ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for SEQAIJCUSPARSE matrices.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatICCFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric Cholesky routine on B.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset routine receives the address of the local pointer */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);
  CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);
  CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
353087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for SEQAIJCUSPARSE matrices.

  Resets the triangular-factor data held in B->spptr, delegates the symbolic phase to
  MatCholeskyFactorSymbolic_SeqAIJ() and installs the CUSPARSE numeric Cholesky routine on B.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* the reset routine receives the address of the local pointer */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&triFactors);
  CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);
  CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
365087f3262SPaul Mullowney 
366087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3679ae82921SPaul Mullowney {
3689ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3699ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3709ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
371aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3729ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3739ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3749ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3759ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3769ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
377b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
37857d48284SJunchao Zhang   cudaError_t                       cerr;
3799ae82921SPaul Mullowney 
3809ae82921SPaul Mullowney   PetscFunctionBegin;
381cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
382c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3839ae82921SPaul Mullowney     try {
3849ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3859ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
386da79fbbcSStefano Zampini       if (!loTriFactor) {
3872cbc15d9SMark         PetscScalar                       *AALo;
3882cbc15d9SMark 
3892cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3909ae82921SPaul Mullowney 
3919ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
39257d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
39357d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3949ae82921SPaul Mullowney 
3959ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3969ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3979ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3989ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3999ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4009ae82921SPaul Mullowney         v        = aa;
4019ae82921SPaul Mullowney         vi       = aj;
4029ae82921SPaul Mullowney         offset   = 1;
4039ae82921SPaul Mullowney         rowOffset= 1;
4049ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4059ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
406e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4079ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4089ae82921SPaul Mullowney           rowOffset += nz+1;
4099ae82921SPaul Mullowney 
410580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
411580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
4129ae82921SPaul Mullowney 
4139ae82921SPaul Mullowney           offset      += nz;
4149ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4159ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4169ae82921SPaul Mullowney           offset      += 1;
4179ae82921SPaul Mullowney 
4189ae82921SPaul Mullowney           v  += nz;
4199ae82921SPaul Mullowney           vi += nz;
4209ae82921SPaul Mullowney         }
4212205254eSKarl Rupp 
422aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
423da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
424da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
425aa372e3fSPaul Mullowney         /* Create the matrix description */
42657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
42757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4281b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
429afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
430afb2bd1cSJunchao Zhang        #else
43157d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
432afb2bd1cSJunchao Zhang        #endif
43357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
43457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
435aa372e3fSPaul Mullowney 
436aa372e3fSPaul Mullowney         /* set the operation */
437aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
438aa372e3fSPaul Mullowney 
439aa372e3fSPaul Mullowney         /* set the matrix */
440aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
441aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
442aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
443aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
444aa372e3fSPaul Mullowney 
445aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
446aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
447aa372e3fSPaul Mullowney 
448aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
449aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
450aa372e3fSPaul Mullowney 
451aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
452aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
453aa372e3fSPaul Mullowney 
454afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
455da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
456afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4571b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
458afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
459afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
460afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
461afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
462afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
463afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
464afb2bd1cSJunchao Zhang       #endif
465afb2bd1cSJunchao Zhang 
466aa372e3fSPaul Mullowney         /* perform the solve analysis */
467aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
468aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
469aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
470afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4711b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
472afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
473afb2bd1cSJunchao Zhang                                #endif
474afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
475da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
476da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
477aa372e3fSPaul Mullowney 
478da79fbbcSStefano Zampini         /* assign the pointer */
479aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4802cbc15d9SMark         loTriFactor->AA_h = AALo;
48157d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
48257d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4834863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
484da79fbbcSStefano Zampini       } else { /* update values only */
4852cbc15d9SMark         if (!loTriFactor->AA_h) {
4862cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4872cbc15d9SMark         }
488da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4892cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
490da79fbbcSStefano Zampini         v        = aa;
491da79fbbcSStefano Zampini         vi       = aj;
492da79fbbcSStefano Zampini         offset   = 1;
493da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
494da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4952cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
496da79fbbcSStefano Zampini           offset      += nz;
4972cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
498da79fbbcSStefano Zampini           offset      += 1;
499da79fbbcSStefano Zampini           v  += nz;
500da79fbbcSStefano Zampini         }
5012cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
502da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
503da79fbbcSStefano Zampini       }
5049ae82921SPaul Mullowney     } catch(char *ex) {
5059ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5069ae82921SPaul Mullowney     }
5079ae82921SPaul Mullowney   }
5089ae82921SPaul Mullowney   PetscFunctionReturn(0);
5099ae82921SPaul Mullowney }
5109ae82921SPaul Mullowney 
511087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
5129ae82921SPaul Mullowney {
5139ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
5149ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
5159ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
516aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
5179ae82921SPaul Mullowney   cusparseStatus_t                  stat;
5189ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
5199ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
5209ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
5219ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
5229ae82921SPaul Mullowney   PetscErrorCode                    ierr;
52357d48284SJunchao Zhang   cudaError_t                       cerr;
5249ae82921SPaul Mullowney 
5259ae82921SPaul Mullowney   PetscFunctionBegin;
526cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
527c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
5289ae82921SPaul Mullowney     try {
5299ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
5309ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
531da79fbbcSStefano Zampini       if (!upTriFactor) {
5322cbc15d9SMark         PetscScalar *AAUp;
5332cbc15d9SMark 
5342cbc15d9SMark         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
5352cbc15d9SMark 
5369ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
53757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
53857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
5399ae82921SPaul Mullowney 
5409ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5419ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5429ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5439ae82921SPaul Mullowney         offset = nzUpper;
5449ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5459ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5469ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5479ae82921SPaul Mullowney 
548e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5499ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5509ae82921SPaul Mullowney 
551e057df02SPaul Mullowney           /* decrement the offset */
5529ae82921SPaul Mullowney           offset -= (nz+1);
5539ae82921SPaul Mullowney 
554e057df02SPaul Mullowney           /* first, set the diagonal elements */
5559ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
55609f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5579ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5589ae82921SPaul Mullowney 
559580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
560580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5619ae82921SPaul Mullowney         }
5622205254eSKarl Rupp 
563aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
564da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
565da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5662205254eSKarl Rupp 
567aa372e3fSPaul Mullowney         /* Create the matrix description */
56857d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
56957d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
5701b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
571afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
572afb2bd1cSJunchao Zhang        #else
57357d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
574afb2bd1cSJunchao Zhang        #endif
57557d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
57657d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
577aa372e3fSPaul Mullowney 
578aa372e3fSPaul Mullowney         /* set the operation */
579aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
580aa372e3fSPaul Mullowney 
581aa372e3fSPaul Mullowney         /* set the matrix */
582aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
583aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
584aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
585aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
586aa372e3fSPaul Mullowney 
587aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
588aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
589aa372e3fSPaul Mullowney 
590aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
591aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
592aa372e3fSPaul Mullowney 
593aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
594aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
595aa372e3fSPaul Mullowney 
596afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
597da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
598afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
5991b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
600afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
601afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
602afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
603afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
604afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
605afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
606afb2bd1cSJunchao Zhang       #endif
607afb2bd1cSJunchao Zhang 
608aa372e3fSPaul Mullowney         /* perform the solve analysis */
609aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
610aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
611aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
612afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
6131b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
614afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
615afb2bd1cSJunchao Zhang                                #endif
616afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
617da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
618da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
619aa372e3fSPaul Mullowney 
620da79fbbcSStefano Zampini         /* assign the pointer */
621aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
6222cbc15d9SMark         upTriFactor->AA_h = AAUp;
62357d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
62457d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
6254863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
626da79fbbcSStefano Zampini       } else {
6272cbc15d9SMark         if (!upTriFactor->AA_h) {
6282cbc15d9SMark           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
6292cbc15d9SMark         }
630da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
631da79fbbcSStefano Zampini         offset = nzUpper;
632da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
633da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
634da79fbbcSStefano Zampini 
635da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
636da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
637da79fbbcSStefano Zampini 
638da79fbbcSStefano Zampini           /* decrement the offset */
639da79fbbcSStefano Zampini           offset -= (nz+1);
640da79fbbcSStefano Zampini 
641da79fbbcSStefano Zampini           /* first, set the diagonal elements */
6422cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
6432cbc15d9SMark           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
644da79fbbcSStefano Zampini         }
6452cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
646da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
647da79fbbcSStefano Zampini       }
6489ae82921SPaul Mullowney     } catch(char *ex) {
6499ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6509ae82921SPaul Mullowney     }
6519ae82921SPaul Mullowney   }
6529ae82921SPaul Mullowney   PetscFunctionReturn(0);
6539ae82921SPaul Mullowney }
6549ae82921SPaul Mullowney 
655087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6569ae82921SPaul Mullowney {
6579ae82921SPaul Mullowney   PetscErrorCode               ierr;
6589ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6599ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6609ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6619ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6629ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6639ae82921SPaul Mullowney 
6649ae82921SPaul Mullowney   PetscFunctionBegin;
665da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
666087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
667087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
6682205254eSKarl Rupp 
669da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
670aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6719ae82921SPaul Mullowney 
672c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
673e057df02SPaul Mullowney   /* lower triangular indices */
6749ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
675da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
676da79fbbcSStefano Zampini     const PetscInt *r;
677da79fbbcSStefano Zampini 
678da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
679aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
680aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6819ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
682da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
683da79fbbcSStefano Zampini   }
6849ae82921SPaul Mullowney 
685e057df02SPaul Mullowney   /* upper triangular indices */
6869ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
687da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
688da79fbbcSStefano Zampini     const PetscInt *c;
689da79fbbcSStefano Zampini 
690da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
691aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
692aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6939ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
694da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
695da79fbbcSStefano Zampini   }
6969ae82921SPaul Mullowney   PetscFunctionReturn(0);
6979ae82921SPaul Mullowney }
6989ae82921SPaul Mullowney 
699087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
700087f3262SPaul Mullowney {
701087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
702087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
703aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
704aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
705087f3262SPaul Mullowney   cusparseStatus_t                  stat;
706087f3262SPaul Mullowney   PetscErrorCode                    ierr;
70757d48284SJunchao Zhang   cudaError_t                       cerr;
708087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
709087f3262SPaul Mullowney   PetscScalar                       *AAUp;
710087f3262SPaul Mullowney   PetscScalar                       *AALo;
711087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
712087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
713087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
714087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
715087f3262SPaul Mullowney 
716087f3262SPaul Mullowney   PetscFunctionBegin;
717cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
718c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
719087f3262SPaul Mullowney     try {
720da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
721da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
722da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
723087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
72457d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
72557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
726087f3262SPaul Mullowney 
727087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
728087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
729087f3262SPaul Mullowney         AiUp[n]=nzUpper;
730087f3262SPaul Mullowney         offset = 0;
731087f3262SPaul Mullowney         for (i=0; i<n; i++) {
732087f3262SPaul Mullowney           /* set the pointers */
733087f3262SPaul Mullowney           v  = aa + ai[i];
734087f3262SPaul Mullowney           vj = aj + ai[i];
735087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
736087f3262SPaul Mullowney 
737087f3262SPaul Mullowney           /* first, set the diagonal elements */
738087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
73909f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
740087f3262SPaul Mullowney           AiUp[i]      = offset;
74109f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
742087f3262SPaul Mullowney 
743087f3262SPaul Mullowney           offset+=1;
744087f3262SPaul Mullowney           if (nz>0) {
745f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
746580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
747087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
748087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
749087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
750087f3262SPaul Mullowney             }
751087f3262SPaul Mullowney             offset+=nz;
752087f3262SPaul Mullowney           }
753087f3262SPaul Mullowney         }
754087f3262SPaul Mullowney 
755aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
756da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
757da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
758087f3262SPaul Mullowney 
759aa372e3fSPaul Mullowney         /* Create the matrix description */
76057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
76157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7621b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
763afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
764afb2bd1cSJunchao Zhang        #else
76557d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
766afb2bd1cSJunchao Zhang        #endif
76757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
76857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
769087f3262SPaul Mullowney 
770aa372e3fSPaul Mullowney         /* set the matrix */
771aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
772aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
773aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
774aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
775aa372e3fSPaul Mullowney 
776aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
777aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
778aa372e3fSPaul Mullowney 
779aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
780aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
781aa372e3fSPaul Mullowney 
782aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
783aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
784aa372e3fSPaul Mullowney 
785afb2bd1cSJunchao Zhang         /* set the operation */
786afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
787afb2bd1cSJunchao Zhang 
788afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
789da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
790afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7911b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
792afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
793afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
794afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
795afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
796afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
797afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
798afb2bd1cSJunchao Zhang       #endif
799afb2bd1cSJunchao Zhang 
800aa372e3fSPaul Mullowney         /* perform the solve analysis */
801aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
802aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
803aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
804afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
8051b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
806afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
807afb2bd1cSJunchao Zhang                                 #endif
808afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
809da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
810da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
811aa372e3fSPaul Mullowney 
812da79fbbcSStefano Zampini         /* assign the pointer */
813aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
814aa372e3fSPaul Mullowney 
815aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
816da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
817da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
818aa372e3fSPaul Mullowney 
819aa372e3fSPaul Mullowney         /* Create the matrix description */
82057d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
82157d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8221b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
823afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
824afb2bd1cSJunchao Zhang        #else
82557d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
826afb2bd1cSJunchao Zhang        #endif
82757d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
82857d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
829aa372e3fSPaul Mullowney 
830aa372e3fSPaul Mullowney         /* set the operation */
831aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
832aa372e3fSPaul Mullowney 
833aa372e3fSPaul Mullowney         /* set the matrix */
834aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
835aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
836aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
837aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
838aa372e3fSPaul Mullowney 
839aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
840aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
841aa372e3fSPaul Mullowney 
842aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
843aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
846aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
847aa372e3fSPaul Mullowney 
848afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
849da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
850afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8511b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
852afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
853afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
854afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
855afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
856afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
857afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
858afb2bd1cSJunchao Zhang       #endif
859afb2bd1cSJunchao Zhang 
860aa372e3fSPaul Mullowney         /* perform the solve analysis */
861aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
862aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
863aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
864afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8651b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
866afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
867afb2bd1cSJunchao Zhang                                 #endif
868afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
869da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
870da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
871aa372e3fSPaul Mullowney 
872da79fbbcSStefano Zampini         /* assign the pointer */
873aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
874087f3262SPaul Mullowney 
875da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
87657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
87757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
878da79fbbcSStefano Zampini       } else {
879da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
880da79fbbcSStefano Zampini         offset = 0;
881da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
882da79fbbcSStefano Zampini           /* set the pointers */
883da79fbbcSStefano Zampini           v  = aa + ai[i];
884da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
885da79fbbcSStefano Zampini 
886da79fbbcSStefano Zampini           /* first, set the diagonal elements */
887da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
888da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
889da79fbbcSStefano Zampini 
890da79fbbcSStefano Zampini           offset+=1;
891da79fbbcSStefano Zampini           if (nz>0) {
892da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
893da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
894da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
895da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
896da79fbbcSStefano Zampini             }
897da79fbbcSStefano Zampini             offset+=nz;
898da79fbbcSStefano Zampini           }
899da79fbbcSStefano Zampini         }
900da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
901da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
902da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
903da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
904da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
905da79fbbcSStefano Zampini       }
90657d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
90757d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
908087f3262SPaul Mullowney     } catch(char *ex) {
909087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
910087f3262SPaul Mullowney     }
911087f3262SPaul Mullowney   }
912087f3262SPaul Mullowney   PetscFunctionReturn(0);
913087f3262SPaul Mullowney }
914087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors of a factored
  matrix through MatSeqAIJCUSPARSEBuildICCTriMatrices() and, when the row ordering stored in
  the Mat_SeqAIJ data is not the identity, copies the permutation and its inverse into the
  thrust arrays held by the Mat_SeqAIJCUSPARSETriFactors structure.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  The routine may be called more than once on the same matrix (the work vector is only
  allocated when missing), so the permutation arrays are likewise allocated only when they
  do not exist yet and are otherwise refilled in place; allocating them unconditionally
  would leak the arrays created by the previous call.

  Returns 0 on success; raises PETSC_ERR_COR if A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* every off-diagonal entry is counted twice (once per triangular factor), the n diagonal entries once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation and its inverse, only needed for a non-identity ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* reuse the arrays left by an earlier call instead of leaking them; assign() resizes as needed */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
952087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for the
  SeqAIJCUSPARSE type.

  The factorization itself is delegated to MatCholeskyFactorNumeric_SeqAIJ() after
  MatSeqAIJCUSPARSECopyFromGPU() has been called on A. Afterwards the solve callbacks of B
  are chosen according to whether the row ordering stored in B is the identity, and the
  triangular factors are set up through MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU().

  Input Parameters:
+ B    - the factor matrix to fill
. A    - the matrix to factor
- info - factorization options, forwarded unchanged
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact   = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the multi-vector solves are unset for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* select the MatSolve variant matching the ordering */
  ierr = ISIdentity(rowperm,&natural);CHKERRQ(ierr);
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build the triangular factors used by the solves above */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9829ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSETransposeTriFactor_Private - builds the explicit transpose of one
  triangular factor and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used here only as the object of the log events
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the triangular factor to transpose

  Output Parameter:
. triFactorTOut - the newly allocated transposed factor; set only when every step succeeded

  Notes:
  The transposed factor copies the matrix type, index base and diagonal type of the
  source descriptor, flips the fill mode, and is solved with
  CUSPARSE_OPERATION_NON_TRANSPOSE. With CUDA >= 11 the csr2csc work buffer is allocated
  here and stored on the source factor (triFactor->csr2cscBuffer).
*/
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor settings of the source factor; the fill mode is flipped by the transposition */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above; a second PetscLogEventBegin() here would leave it unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,triFactorT->solvePolicy, triFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - creates the transposes of the lower and upper
  triangular factors stored in A->spptr and analyzes them for triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are set

  Notes:
  On success the results are stored in loTriFactorPtrTranspose and upTriFactorPtrTranspose.
  The lower factor is processed and stored first, then the upper one.

  Returns 0 on success; raises PETSC_ERR_COR when the factor container or one of the
  factors is missing.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL,*upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing lower triangular factor");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing upper triangular factor");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1184bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, that casts the real part of a PetscScalar to a PetscInt */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar value)
  {
    const PetscInt asInt = (PetscInt)PetscRealPart(value);
    return asInt;
  }
};
1193a49f1ed0SStefano Zampini 
11941a2c6b5cSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1195bda325fcSPaul Mullowney {
1196aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1197a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1198bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1199bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1200aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1201b06137fdSPaul Mullowney   cudaError_t                  err;
120285ba7357SStefano Zampini   PetscErrorCode               ierr;
1203b175d8bbSPaul Mullowney 
1204bda325fcSPaul Mullowney   PetscFunctionBegin;
12051a2c6b5cSJunchao Zhang   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1206a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1207a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1208*e8d2b73aSMark Adams   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
1209a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1210*e8d2b73aSMark Adams   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
12111a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
121285ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1213a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1214a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1215a49f1ed0SStefano Zampini   }
1216a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1217aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
121857d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1219aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
122057d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
122157d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1222aa372e3fSPaul Mullowney 
1223b06137fdSPaul Mullowney     /* set alpha and beta */
1224afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12257656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12267656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1227afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12287656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12297656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1230b06137fdSPaul Mullowney 
1231aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1232aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1233a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1234554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1235554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1236aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1237a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1238aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1239aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1240a3fdcf43SKarl Rupp 
1241039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
124281902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1243afb2bd1cSJunchao Zhang 
1244afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1245afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&matstructT->matDescr,
1246afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1247afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1248afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1249afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1250afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1251afb2bd1cSJunchao Zhang      #endif
1252aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1253afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1254afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1255afb2bd1cSJunchao Zhang    #else
1256aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
125751c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
125851c6d536SStefano Zampini       /* First convert HYB to CSR */
1259aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1260aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1261aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1262aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1263aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1264aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1265aa372e3fSPaul Mullowney 
1266aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1267aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1268aa372e3fSPaul Mullowney                               temp->values->data().get(),
1269aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
127057d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1271aa372e3fSPaul Mullowney 
1272aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1273aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1274aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1275aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1276aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1277aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1278aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1279aa372e3fSPaul Mullowney 
1280aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1281aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1282aa372e3fSPaul Mullowney                               temp->values->data().get(),
1283aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1284aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1285aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1286aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1287aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
128857d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1289aa372e3fSPaul Mullowney 
1290aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1291aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
129257d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1293aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1294aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1295aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1296aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1297aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1298aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
129957d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1300aa372e3fSPaul Mullowney 
1301aa372e3fSPaul Mullowney       /* assign the pointer */
1302aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
13031a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1304aa372e3fSPaul Mullowney       /* delete temporaries */
1305aa372e3fSPaul Mullowney       if (tempT) {
1306aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1307aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1308aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1309aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1310087f3262SPaul Mullowney       }
1311aa372e3fSPaul Mullowney       if (temp) {
1312aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1313aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1314aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1315aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1316aa372e3fSPaul Mullowney       }
1317afb2bd1cSJunchao Zhang      #endif
1318aa372e3fSPaul Mullowney     }
1319a49f1ed0SStefano Zampini   }
1320a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1321a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1322a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1323*e8d2b73aSMark Adams     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
1324*e8d2b73aSMark Adams     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
1325*e8d2b73aSMark Adams     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
1326*e8d2b73aSMark Adams     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
1327*e8d2b73aSMark Adams     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
1328*e8d2b73aSMark Adams     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
1329*e8d2b73aSMark Adams     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
1330*e8d2b73aSMark Adams     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
1331a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1332a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1333a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1334a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1335a49f1ed0SStefano Zampini     }
1336a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1337a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1338a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1339a49f1ed0SStefano Zampini 
1340a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1341a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1342a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1343a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1344a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1345a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1346a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1347a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1348a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1349a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1350a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1351a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1352a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1353a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1354a49f1ed0SStefano Zampini      #endif
1355a49f1ed0SStefano Zampini 
13561a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
13571a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
13581a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
13591a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
13601a2c6b5cSJunchao Zhang 
13611a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
13621a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
13631a2c6b5cSJunchao Zhang         */
13641a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
13651a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
13661a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
13671a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
13681a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1369a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1370a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1371a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1372a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
13731a2c6b5cSJunchao Zhang                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1374a49f1ed0SStefano Zampini                              #else
1375a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
13761a2c6b5cSJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1377a49f1ed0SStefano Zampini                              #endif
13781a2c6b5cSJunchao Zhang       } else {
13791a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
13801a2c6b5cSJunchao Zhang       }
13811a2c6b5cSJunchao Zhang 
1382a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1383a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1384a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1385a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1386a49f1ed0SStefano Zampini      #endif
1387a49f1ed0SStefano Zampini     }
1388a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1389a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1390a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1391a49f1ed0SStefano Zampini   }
139285ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1393213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1394213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1395aa372e3fSPaul Mullowney   /* assign the pointer */
1396aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
13971a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1398bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1399bda325fcSPaul Mullowney }
1400bda325fcSPaul Mullowney 
1401a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - runs the two transposed triangular solves stored in
  A->spptr (read as Mat_SeqAIJCUSPARSETriFactors) with the row/column permutations applied.

  Input Parameters:
+ A  - the factored matrix; its loTriFactorPtrTranspose/upTriFactorPtrTranspose, rpermIndices,
       cpermIndices, workVector, handle and nnz fields are used
- bb - vector accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - vector accessed through VecCUDAGetArrayWrite(); its local length xx->map->n is used as n

  Data flow (assumes cusparse_solve takes the input vector followed by the output vector as its
  two vector arguments - TODO confirm against its definition):
    bb gathered through rpermIndices -> xx
    solve with upTriFactorT:  xx          -> work vector
    solve with loTriFactorT:  work vector -> xx
    xx gathered through cpermIndices      -> work vector
    work vector                           -> xx

  Every PETSc, cuSPARSE and CUDA status is checked with CHKERRQ/CHKERRCUSPARSE/CHKERRCUDA;
  the function ends with PetscFunctionReturn(0).
*/
14026fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
1403bda325fcSPaul Mullowney {
1404c41cb2e2SAlejandro Lamas Daviña   PetscInt                              n = xx->map->n;
1405465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1406465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1407465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1408465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
1409bda325fcSPaul Mullowney   cusparseStatus_t                      stat;
1410bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1411aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1412aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1413aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1414b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
141557d48284SJunchao Zhang   cudaError_t                           cerr;
1416bda325fcSPaul Mullowney 
1417bda325fcSPaul Mullowney   PetscFunctionBegin;
1418aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* NOTE(review): the analysis only runs when BOTH transposed factors are NULL; if exactly one
     were missing it would be dereferenced below - confirm that state cannot occur */
1419aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1420bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers: the locals above were captured before the analysis call */
1421aa372e3fSPaul Mullowney     loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1422aa372e3fSPaul Mullowney     upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1423bda325fcSPaul Mullowney   }
1424bda325fcSPaul Mullowney 
1425bda325fcSPaul Mullowney   /* Get the GPU pointers */
1426c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1427c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  /* wrap the raw pointers so they can be used as thrust iterators */
1428c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1429c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
1430bda325fcSPaul Mullowney 
14317a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1432aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1433a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1434c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
1435c41cb2e2SAlejandro Lamas Daviña                xGPU);
1436aa372e3fSPaul Mullowney 
1437aa372e3fSPaul Mullowney   /* Next, solve U */
1438aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1439afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
14401b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1441afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1442afb2bd1cSJunchao Zhang                       #endif
1443afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1444aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1445aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1446aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1447aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1448afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
14491b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1450afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1451afb2bd1cSJunchao Zhang                       #endif
1452afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1453aa372e3fSPaul Mullowney 
1454aa372e3fSPaul Mullowney   /* Then, solve L */
1455aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1456afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
14571b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1458afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1459afb2bd1cSJunchao Zhang                       #endif
1460afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1461aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1462aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1463aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1464aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1465afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
14661b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1467afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1468afb2bd1cSJunchao Zhang                       #endif
1469afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1470aa372e3fSPaul Mullowney 
1471aa372e3fSPaul Mullowney   /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
1472a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
1473c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
1474aa372e3fSPaul Mullowney                tempGPU->begin());
1475aa372e3fSPaul Mullowney 
1476aa372e3fSPaul Mullowney   /* Copy the temporary to the full solution. */
1477a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);
1478bda325fcSPaul Mullowney 
1479bda325fcSPaul Mullowney   /* restore */
  /* NOTE(review): the arrays are restored before WaitForCUDA(); presumably WaitForCUDA() is what
     guarantees the work issued above has finished - confirm */
1480c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1481c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
148205035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1483661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flops logged for the pair of solves: 2*nnz of the factors minus the column count of A */
1484958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1485bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1486bda325fcSPaul Mullowney }
1487bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - runs the two transposed triangular solves
  stored in A->spptr (read as Mat_SeqAIJCUSPARSETriFactors) without applying any permutation.

  Input Parameters:
+ A  - the factored matrix; its loTriFactorPtrTranspose/upTriFactorPtrTranspose, workVector,
       handle and nnz fields are used
- bb - vector accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - vector accessed through VecCUDAGetArrayWrite()

  Data flow (assumes cusparse_solve takes the input vector followed by the output vector as its
  two vector arguments - TODO confirm against its definition):
    solve with upTriFactorT:  bb          -> work vector
    solve with loTriFactorT:  work vector -> xx

  Every PETSc, cuSPARSE and CUDA status is checked with CHKERRQ/CHKERRCUSPARSE/CHKERRCUDA;
  the function ends with PetscFunctionReturn(0).
*/
14886fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
1489bda325fcSPaul Mullowney {
1490465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1491465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
1492bda325fcSPaul Mullowney   cusparseStatus_t                  stat;
1493bda325fcSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1494aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1495aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1496aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1497b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
149857d48284SJunchao Zhang   cudaError_t                       cerr;
1499bda325fcSPaul Mullowney 
1500bda325fcSPaul Mullowney   PetscFunctionBegin;
1501aa372e3fSPaul Mullowney   /* Analyze the matrix and create the transpose ... on the fly */
  /* NOTE(review): the analysis only runs when BOTH transposed factors are NULL; if exactly one
     were missing it would be dereferenced below - confirm that state cannot occur */
1502aa372e3fSPaul Mullowney   if (!loTriFactorT && !upTriFactorT) {
1503bda325fcSPaul Mullowney     ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers: the locals above were captured before the analysis call */
1504aa372e3fSPaul Mullowney     loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
1505aa372e3fSPaul Mullowney     upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
1506bda325fcSPaul Mullowney   }
1507bda325fcSPaul Mullowney 
1508bda325fcSPaul Mullowney   /* Get the GPU pointers */
1509c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1510c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1511bda325fcSPaul Mullowney 
15127a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1513aa372e3fSPaul Mullowney   /* First, solve U */
1514aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
1515afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_rows,
15161b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1517afb2bd1cSJunchao Zhang                         upTriFactorT->csrMat->num_entries,
1518afb2bd1cSJunchao Zhang                       #endif
1519afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
1520aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->values->data().get(),
1521aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->row_offsets->data().get(),
1522aa372e3fSPaul Mullowney                         upTriFactorT->csrMat->column_indices->data().get(),
1523aa372e3fSPaul Mullowney                         upTriFactorT->solveInfo,
1524afb2bd1cSJunchao Zhang                         barray, tempGPU->data().get()
15251b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1526afb2bd1cSJunchao Zhang                         ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
1527afb2bd1cSJunchao Zhang                       #endif
1528afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1529aa372e3fSPaul Mullowney 
1530aa372e3fSPaul Mullowney   /* Then, solve L */
1531aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
1532afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_rows,
15331b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1534afb2bd1cSJunchao Zhang                         loTriFactorT->csrMat->num_entries,
1535afb2bd1cSJunchao Zhang                       #endif
1536afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
1537aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->values->data().get(),
1538aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->row_offsets->data().get(),
1539aa372e3fSPaul Mullowney                         loTriFactorT->csrMat->column_indices->data().get(),
1540aa372e3fSPaul Mullowney                         loTriFactorT->solveInfo,
1541afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
15421b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1543afb2bd1cSJunchao Zhang                         ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
1544afb2bd1cSJunchao Zhang                       #endif
1545afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1546bda325fcSPaul Mullowney 
1547bda325fcSPaul Mullowney   /* restore */
  /* NOTE(review): the arrays are restored before WaitForCUDA(); presumably WaitForCUDA() is what
     guarantees the work issued above has finished - confirm */
1548c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1549c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
155005035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1551661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flops logged for the pair of solves: 2*nnz of the factors minus the column count of A */
1552958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
1553bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1554bda325fcSPaul Mullowney }
1555bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - runs the two triangular solves stored in A->spptr (read as
  Mat_SeqAIJCUSPARSETriFactors) with the row/column permutations applied.

  Input Parameters:
+ A  - the factored matrix; its loTriFactorPtr/upTriFactorPtr, rpermIndices, cpermIndices,
       workVector, handle and nnz fields are used
- bb - vector accessed read-only through VecCUDAGetArrayRead()

  Output Parameter:
. xx - vector accessed through VecCUDAGetArrayWrite()

  Data flow (assumes cusparse_solve takes the input vector followed by the output vector as its
  two vector arguments - TODO confirm against its definition):
    bb gathered through rpermIndices          -> work vector
    solve with loTriFactor:  work vector      -> xx
    solve with upTriFactor:  xx               -> work vector
    work vector gathered through cpermIndices -> xx

  Unlike the transpose variants there is no on-the-fly analysis here: loTriFactor and upTriFactor
  are dereferenced without a NULL check, so they must already be set up by the caller's
  factorization path (NOTE(review): confirm).

  Every PETSc, cuSPARSE and CUDA status is checked with CHKERRQ/CHKERRCUSPARSE/CHKERRCUDA;
  the function ends with PetscFunctionReturn(0).
*/
15566fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
15579ae82921SPaul Mullowney {
1558465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1559465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1560465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1561465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15629ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15639ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1564aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1565aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1566aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1567b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
156857d48284SJunchao Zhang   cudaError_t                           cerr;
15699ae82921SPaul Mullowney 
15709ae82921SPaul Mullowney   PetscFunctionBegin;
1571ebc8f436SDominic Meiser 
1572e057df02SPaul Mullowney   /* Get the GPU pointers */
1573c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1574c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  /* wrap the raw pointers so they can be used as thrust iterators */
1575c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1576c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
15779ae82921SPaul Mullowney 
15787a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1579aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1580a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1581c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
15824e4bbfaaSStefano Zampini                tempGPU->begin());
1583aa372e3fSPaul Mullowney 
1584aa372e3fSPaul Mullowney   /* Next, solve L */
1585aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1586afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
15871b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1588afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1589afb2bd1cSJunchao Zhang                       #endif
1590afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1591aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1592aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1593aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1594aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1595afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
15961b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1597afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1598afb2bd1cSJunchao Zhang                       #endif
1599afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1600aa372e3fSPaul Mullowney 
1601aa372e3fSPaul Mullowney   /* Then, solve U */
1602aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1603afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16041b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1605afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1606afb2bd1cSJunchao Zhang                       #endif
1607afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1608aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1609aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1610aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1611aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1612afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
16131b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1614afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1615afb2bd1cSJunchao Zhang                       #endif
1616afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1617aa372e3fSPaul Mullowney 
16184e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1619a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
16204e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
16214e4bbfaaSStefano Zampini                xGPU);
16229ae82921SPaul Mullowney 
  /* NOTE(review): the arrays are restored before WaitForCUDA(); presumably WaitForCUDA() is what
     guarantees the work issued above has finished - confirm */
1623c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1624c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
162505035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1626661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  /* flops logged for the pair of solves: 2*nnz of the factors minus the column count of A */
1627958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
16289ae82921SPaul Mullowney   PetscFunctionReturn(0);
16299ae82921SPaul Mullowney }
16309ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Applies the stored lower and upper triangular
  factors to a right-hand side on the GPU. No row or column permutation is applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The lower solve reads bb and writes the factor work vector; the upper solve reads the
  work vector and writes xx. The routine waits for the device before stopping the GPU
  timer and then logs 2*nnz - (number of local columns) flops.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Device pointers for the output and the input vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Lower triangular solve: rhs -> work */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        rhs, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Upper triangular solve: work -> sol */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), sol
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  /* Wait for the device so the GPU timer covers both solves */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16909ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Refreshes the host copy of the matrix values from the device.

  Input Parameter:
. A - the matrix

  Notes:
  Nothing happens unless A->offloadmask is PETSC_OFFLOAD_GPU. In that case the a->nz
  values stored in the device CsrMatrix are copied into a->a and the offload mask is
  set to PETSC_OFFLOAD_BOTH. Only the values array is copied here.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *device = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  size_t             nbytes;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* Host data is already current (or there is nothing on the device) */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  csr    = (CsrMatrix*)device->mat->mat;
  nbytes = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
17117e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array of the matrix.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a)

  Notes:
  The host values are first refreshed from the device if needed. The offload mask is then
  set to PETSC_OFFLOAD_CPU, since the array handed out is not const and the caller may
  modify it.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = aij->a;
  PetscFunctionReturn(0);
}
17237e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the device representation of the matrix up to date
  with the host CSR data.

  Input Parameter:
. A - the matrix

  Notes:
  Raises an error if A is bound to the CPU. Does nothing unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Two paths exist:
+ values only - taken when the nonzero state recorded at the previous copy equals
                A->nonzerostate and the storage format is MAT_CUSPARSE_CSR; only the a->nz
                numerical values are sent.
- rebuild     - the existing device matrix, the work vector and the cached row offsets are
                destroyed and everything is rebuilt from the host row offsets, column indices
                and (when present) values. The compressed-row host arrays are used when
                a->compressedrow.use is set.

  If the host has no values array (a->a is NULL) the rebuild path only creates the structure
  and the offload mask is left unchanged; otherwise it becomes PETSC_OFFLOAD_BOTH.

  With CUDA >= 11 the ELL and HYB formats are rejected with PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* cleared when no values could be sent */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* NOTE(review): the values changed, so the cached transpose is invalidated; the exact
         meaning of the PETSC_FALSE flag is defined by MatSeqAIJCUSPARSEInvalidateTranspose() */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* Throw away everything that depends on the old nonzero structure */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* Select the host row structure: compressed rows (with their row index list) or the full CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* Without host values only the structure is built; the count then comes from the row offsets */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* Device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* Stage the matrix as a temporary device CSR, convert it, then release the staging copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* all three arrays were just allocated above, so no null checks are needed */
          delete mat->values;
          delete mat->column_indices;
          delete mat->row_offsets;
          delete mat;
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* Log what was actually sent: m+1 row offsets and nnz column indices (32-bit), the optional
           compressed-row index list, the three scalar constants, and the values only if the host has them */
        ierr = PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* Remember the structure that is now on the device so later calls can take the values-only path */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18809ae82921SPaul Mullowney 
/* Functor for tuple-based traversals: adds element 0 of the tuple into element 1
   (element 1 = element 1 + element 0). Element 0 is only read. */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    auto &&dest = thrust::get<1>(t);
    dest = dest + thrust::get<0>(t);
  }
};
1890aa372e3fSPaul Mullowney 
/* Functor for tuple-based traversals: copies element 0 of the tuple into element 1.
   Element 0 is only read. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    auto &&dest = thrust::get<1>(t);
    dest = thrust::get<0>(t);
  }
};
19007e8381f9SStefano Zampini 
/* Functor for tuple-based traversals: copies element 1 of the tuple into element 0,
   i.e. the opposite direction of VecCUDAEquals. Element 1 is only read. */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    auto &&dest = thrust::get<0>(t);
    dest = thrust::get<1>(t);
  }
};
1910e6e9a74fSStefano Zampini 
/* Per-product data attached to a matrix product that involves a SEQAIJCUSPARSE matrix.
   It is released by MatDestroy_MatMatCusparse(). */
struct MatMatCusparse {
  PetscBool             cisdense;     /* NOTE(review): set and read outside this section; presumably records whether C was a plain dense matrix - confirm */
  PetscScalar           *Bt;          /* device buffer, released with cudaFree(); NOTE(review): presumably an explicit transpose of B - confirm */
  Mat                   X;            /* intermediate dense matrix; receives the sparse-dense product for the RARt and PtAP cases */
  PetscBool             reusesym;     /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;        /* NOTE(review): presumably the flop count logged for the product - confirm */
  CsrMatrix             *Bcsr;        /* CSR wrapper owned by this struct (deleted on destroy) */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;  /* sparse matrix descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;    /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;    /* dense descriptor for C or for X */
  PetscInt              Blda;         /* Record leading dimensions of B and C here to detect changes */
  PetscInt              Clda;
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                  *mmBuffer;    /* device work buffer passed to cusparseSpMM() */
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;   /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1930ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything held by a MatMatCusparse and the struct itself.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy

  Notes:
  The device buffers are freed, the CSR wrapper is deleted, the cuSPARSE descriptors (CUDA >= 11
  only) are destroyed when set, the intermediate matrix X is destroyed and finally the struct
  memory is returned with PetscFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* Each handle is optional; only destroy the ones that were created */
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1955ccdfe979SStefano Zampini 
1956ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1957ccdfe979SStefano Zampini 
1958ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1959ccdfe979SStefano Zampini {
1960ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1961ccdfe979SStefano Zampini   Mat                          A,B;
1962afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1963ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1964ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1965ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1966ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1967ccdfe979SStefano Zampini   const PetscScalar            *barray;
1968ccdfe979SStefano Zampini   PetscScalar                  *carray;
1969ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1970ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1971ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1972ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1973afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1974ccdfe979SStefano Zampini 
1975ccdfe979SStefano Zampini   PetscFunctionBegin;
1976ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1977*e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
1978ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1979ccdfe979SStefano Zampini   A    = product->A;
1980ccdfe979SStefano Zampini   B    = product->B;
1981ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1982*e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
1983ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1984ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1985ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1986ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1987ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
1988ccdfe979SStefano Zampini   switch (product->type) {
1989ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1990ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1991ccdfe979SStefano Zampini     mat = cusp->mat;
1992ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1993ccdfe979SStefano Zampini     m   = A->rmap->n;
1994ccdfe979SStefano Zampini     n   = B->cmap->n;
1995ccdfe979SStefano Zampini     break;
1996ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
19971a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
1998e6e9a74fSStefano Zampini       mat = cusp->mat;
1999e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2000e6e9a74fSStefano Zampini     } else {
20011a2c6b5cSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2002ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2003ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2004e6e9a74fSStefano Zampini     }
2005ccdfe979SStefano Zampini     m = A->cmap->n;
2006ccdfe979SStefano Zampini     n = B->cmap->n;
2007ccdfe979SStefano Zampini     break;
2008ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2009ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2010ccdfe979SStefano Zampini     mat = cusp->mat;
2011ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2012ccdfe979SStefano Zampini     m   = A->rmap->n;
2013ccdfe979SStefano Zampini     n   = B->rmap->n;
2014ccdfe979SStefano Zampini     break;
2015ccdfe979SStefano Zampini   default:
2016*e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2017ccdfe979SStefano Zampini   }
2018*e8d2b73aSMark Adams   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2019ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2020ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2021ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2022afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2023ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2024afb2bd1cSJunchao Zhang 
2025ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2026c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2027c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2028c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2029c8378d12SStefano Zampini   } else {
2030c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2031c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2032c8378d12SStefano Zampini   }
2033c8378d12SStefano Zampini 
2034c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2035afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2036afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2037fcdce8c4SStefano Zampini   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2038afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2039fcdce8c4SStefano Zampini     size_t mmBufferSize;
2040afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2041afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2042afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2043afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2044afb2bd1cSJunchao Zhang     }
2045c8378d12SStefano Zampini 
2046afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2047afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2048afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2049afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2050afb2bd1cSJunchao Zhang     }
2051afb2bd1cSJunchao Zhang 
2052afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2053afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2054afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2055afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2056afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2057afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2058afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2059afb2bd1cSJunchao Zhang     }
2060afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2061afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2062afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2063fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2064fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2065fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2066fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2067fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2068fcdce8c4SStefano Zampini     }
2069afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2070afb2bd1cSJunchao Zhang   } else {
2071afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2072afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2073afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2074afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2075afb2bd1cSJunchao Zhang   }
2076afb2bd1cSJunchao Zhang 
2077afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2078afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2079afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2080afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2081fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2082afb2bd1cSJunchao Zhang  #else
2083afb2bd1cSJunchao Zhang   PetscInt k;
2084afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2085ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2086ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2087ccdfe979SStefano Zampini     cublasStatus_t cerr;
2088ccdfe979SStefano Zampini 
2089ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2090ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2091ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2092ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2093ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2094ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2095ccdfe979SStefano Zampini     blda = B->cmap->n;
2096afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2097afb2bd1cSJunchao Zhang   } else {
2098afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2099ccdfe979SStefano Zampini   }
2100ccdfe979SStefano Zampini 
2101afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2102ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2103afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2104ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2105ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2106ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2107ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2108ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2109afb2bd1cSJunchao Zhang  #endif
2110afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2111c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2112c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2113ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2114ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2115ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2116ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2117ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2118ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2119ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2120ccdfe979SStefano Zampini   } else {
2121ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2122ccdfe979SStefano Zampini   }
2123ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2124ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2125ccdfe979SStefano Zampini   }
2126ccdfe979SStefano Zampini   if (!biscuda) {
2127ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2128ccdfe979SStefano Zampini   }
2129ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2130ccdfe979SStefano Zampini }
2131ccdfe979SStefano Zampini 
2132ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2133ccdfe979SStefano Zampini {
2134ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2135ccdfe979SStefano Zampini   Mat                A,B;
2136ccdfe979SStefano Zampini   PetscInt           m,n;
2137ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2138ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2139ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2140ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2141ccdfe979SStefano Zampini 
2142ccdfe979SStefano Zampini   PetscFunctionBegin;
2143ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2144*e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2145ccdfe979SStefano Zampini   A    = product->A;
2146ccdfe979SStefano Zampini   B    = product->B;
2147ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2148*e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2149ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2150*e8d2b73aSMark Adams   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2151ccdfe979SStefano Zampini   switch (product->type) {
2152ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2153ccdfe979SStefano Zampini     m = A->rmap->n;
2154ccdfe979SStefano Zampini     n = B->cmap->n;
2155ccdfe979SStefano Zampini     break;
2156ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2157ccdfe979SStefano Zampini     m = A->cmap->n;
2158ccdfe979SStefano Zampini     n = B->cmap->n;
2159ccdfe979SStefano Zampini     break;
2160ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2161ccdfe979SStefano Zampini     m = A->rmap->n;
2162ccdfe979SStefano Zampini     n = B->rmap->n;
2163ccdfe979SStefano Zampini     break;
2164ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2165ccdfe979SStefano Zampini     m = B->cmap->n;
2166ccdfe979SStefano Zampini     n = B->cmap->n;
2167ccdfe979SStefano Zampini     break;
2168ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2169ccdfe979SStefano Zampini     m = B->rmap->n;
2170ccdfe979SStefano Zampini     n = B->rmap->n;
2171ccdfe979SStefano Zampini     break;
2172ccdfe979SStefano Zampini   default:
2173*e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2174ccdfe979SStefano Zampini   }
2175ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2176ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2177ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2178ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2179ccdfe979SStefano Zampini 
2180ccdfe979SStefano Zampini   /* product data */
2181ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2182ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2183afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2184afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2185ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2186afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2187ccdfe979SStefano Zampini   }
2188afb2bd1cSJunchao Zhang  #endif
2189ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2190ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2191ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2192ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2193ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2194ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2195ccdfe979SStefano Zampini     } else {
2196ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2197ccdfe979SStefano Zampini     }
2198ccdfe979SStefano Zampini   }
2199ccdfe979SStefano Zampini   C->product->data    = mmdata;
2200ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2201ccdfe979SStefano Zampini 
2202ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2203ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2204ccdfe979SStefano Zampini }
2205ccdfe979SStefano Zampini 
2206fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2207ccdfe979SStefano Zampini {
2208ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2209fcdce8c4SStefano Zampini   Mat                          A,B;
2210fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2211fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2212fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2213fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2214fcdce8c4SStefano Zampini   PetscBool                    flg;
2215ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2216fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2217fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2218fcdce8c4SStefano Zampini   MatProductType               ptype;
2219fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2220fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2221fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2222fcdce8c4SStefano Zampini #endif
2223ccdfe979SStefano Zampini 
2224ccdfe979SStefano Zampini   PetscFunctionBegin;
2225ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2226*e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2227fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2228*e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2229fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2230fcdce8c4SStefano Zampini   A = product->A;
2231fcdce8c4SStefano Zampini   B = product->B;
2232fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2233fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2234fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2235*e8d2b73aSMark Adams     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2236fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
2237*e8d2b73aSMark Adams     if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2238fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2239*e8d2b73aSMark Adams     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2240fcdce8c4SStefano Zampini     goto finalize;
2241fcdce8c4SStefano Zampini   }
2242fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2243fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2244*e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2245fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2246*e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2247fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2248fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2249fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2250fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2251fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2252*e8d2b73aSMark Adams   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2253*e8d2b73aSMark Adams   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2254*e8d2b73aSMark Adams   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2255fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2256fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2257fcdce8c4SStefano Zampini 
2258fcdce8c4SStefano Zampini   ptype = product->type;
2259fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2260fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2261fcdce8c4SStefano Zampini   switch (ptype) {
2262fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2263fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2264fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2265fcdce8c4SStefano Zampini     break;
2266fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2267fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2268fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2269fcdce8c4SStefano Zampini     break;
2270fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2271fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2272fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2273fcdce8c4SStefano Zampini     break;
2274fcdce8c4SStefano Zampini   default:
2275*e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2276fcdce8c4SStefano Zampini   }
2277fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
2278*e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2279*e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2280*e8d2b73aSMark Adams   if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2281fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2282fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2283fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2284*e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2285*e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2286*e8d2b73aSMark Adams   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2287fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2288fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2289fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2290fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2291fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2292fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2293fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2294fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2295fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2296fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2297fcdce8c4SStefano Zampini #else
2298fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2299fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2300fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2301fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2302fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2303fcdce8c4SStefano Zampini #endif
2304fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2305fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2306fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2307fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2308fcdce8c4SStefano Zampini finalize:
2309fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2310fcdce8c4SStefano Zampini   ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2311fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2312fcdce8c4SStefano Zampini   ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
2313fcdce8c4SStefano Zampini   c->reallocs         = 0;
2314fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2315fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2316fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2317fcdce8c4SStefano Zampini   C->num_ass++;
2318ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2319ccdfe979SStefano Zampini }
2320fcdce8c4SStefano Zampini 
2321fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2322fcdce8c4SStefano Zampini {
2323fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2324fcdce8c4SStefano Zampini   Mat                          A,B;
2325fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2326fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2327fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2328fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2329fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2330fcdce8c4SStefano Zampini   PetscBool                    flg;
2331fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2332fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2333fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2334fcdce8c4SStefano Zampini   MatProductType               ptype;
2335fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2336fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2337fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2338fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2339fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2340fcdce8c4SStefano Zampini   size_t                       bufSize2;
2341fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2342fcdce8c4SStefano Zampini #else
2343fcdce8c4SStefano Zampini   int                          cnz;
2344fcdce8c4SStefano Zampini #endif
2345fcdce8c4SStefano Zampini 
2346fcdce8c4SStefano Zampini   PetscFunctionBegin;
2347fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2348*e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2349fcdce8c4SStefano Zampini   A    = product->A;
2350fcdce8c4SStefano Zampini   B    = product->B;
2351fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2352*e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2353fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2354*e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2355fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2356fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2357fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2358fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2359*e8d2b73aSMark Adams   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2360*e8d2b73aSMark Adams   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2361fcdce8c4SStefano Zampini 
2362fcdce8c4SStefano Zampini   /* product data */
2363fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2364fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2365fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2366fcdce8c4SStefano Zampini 
2367fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2368fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2369fcdce8c4SStefano Zampini   ptype = product->type;
2370fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2371fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2372fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2373fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2374fcdce8c4SStefano Zampini   switch (ptype) {
2375fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2376fcdce8c4SStefano Zampini     m = A->rmap->n;
2377fcdce8c4SStefano Zampini     n = B->cmap->n;
2378fcdce8c4SStefano Zampini     k = A->cmap->n;
2379fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2380fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2381fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2382fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2383fcdce8c4SStefano Zampini     break;
2384fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2385fcdce8c4SStefano Zampini     m = A->cmap->n;
2386fcdce8c4SStefano Zampini     n = B->cmap->n;
2387fcdce8c4SStefano Zampini     k = A->rmap->n;
23881a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2389fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2390fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2391fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2392fcdce8c4SStefano Zampini     break;
2393fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2394fcdce8c4SStefano Zampini     m = A->rmap->n;
2395fcdce8c4SStefano Zampini     n = B->rmap->n;
2396fcdce8c4SStefano Zampini     k = A->cmap->n;
23971a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2398fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2399fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2400fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2401fcdce8c4SStefano Zampini     break;
2402fcdce8c4SStefano Zampini   default:
2403*e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2404fcdce8c4SStefano Zampini   }
2405fcdce8c4SStefano Zampini 
2406fcdce8c4SStefano Zampini   /* create cusparse matrix */
2407fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2408fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2409fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2410fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2411fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2412fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2413fcdce8c4SStefano Zampini 
2414fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2415fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2416fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2417fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2418fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2419fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2420fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2421fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2422fcdce8c4SStefano Zampini   } else {
2423fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2424fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2425fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2426fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2427fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2428fcdce8c4SStefano Zampini   }
2429fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2430fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2431fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2432fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2433fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2434fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2435fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2436fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2437fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2438fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2439fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2440fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2441fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2442fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2443fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2444fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2445fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2446fcdce8c4SStefano Zampini     c->nz = 0;
2447fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2448fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2449fcdce8c4SStefano Zampini     goto finalizesym;
2450fcdce8c4SStefano Zampini   }
2451fcdce8c4SStefano Zampini 
2452*e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2453*e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2454fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2455fcdce8c4SStefano Zampini   if (!biscompressed) {
2456fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2457fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2458fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2459fcdce8c4SStefano Zampini #endif
2460fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2461fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2462fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2463fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2464fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2465fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2466fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2467fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2468fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2469fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2470fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2471fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2472fcdce8c4SStefano Zampini     }
2473fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2474fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2475fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2476fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2477fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2478fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2479fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2480fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2481fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2482fcdce8c4SStefano Zampini     }
2483fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2484fcdce8c4SStefano Zampini #endif
2485fcdce8c4SStefano Zampini   }
2486*e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2487*e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2488fcdce8c4SStefano Zampini   /* precompute flops count */
2489fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2490fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2491fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2492fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2493fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2494fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2495fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2496fcdce8c4SStefano Zampini       }
2497fcdce8c4SStefano Zampini     }
2498fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2499fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2500fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2501fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2502fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2503fcdce8c4SStefano Zampini     }
2504fcdce8c4SStefano Zampini   } else { /* TODO */
2505fcdce8c4SStefano Zampini     flops = 0.;
2506fcdce8c4SStefano Zampini   }
2507fcdce8c4SStefano Zampini 
2508fcdce8c4SStefano Zampini   mmdata->flops = flops;
2509fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2510fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2511fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2512fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2513fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2514fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2515fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2516fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2517fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2518fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2519fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2520fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2521fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2522bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2523fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2524fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2525fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2526fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2527fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2528fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2529fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2530fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2531fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2532fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2533fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2534fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2535fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2536fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2537fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2538bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2539fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2540fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2541fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2542fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2543fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2544fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2545fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2546fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
254700702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2548fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2549fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2550fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2551fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2552fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2553fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2554fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2555fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2556fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2557fcdce8c4SStefano Zampini #else
2558fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2559fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2560fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2561fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2562fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2563fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2564fcdce8c4SStefano Zampini   c->nz = cnz;
2565fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2566fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2567fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2568fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2569fcdce8c4SStefano Zampini 
2570fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2571fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2572fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2573fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2574fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2575fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2576fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2577fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2578fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2579fcdce8c4SStefano Zampini #endif
2580fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2581fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2582fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2583fcdce8c4SStefano Zampini finalizesym:
2584fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2585fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2586fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2587fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2588fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2589fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2590fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2591fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2592fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2593fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2594fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2595fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2596fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2597fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2598fcdce8c4SStefano Zampini   } else {
2599fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2600fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2601fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2602fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2603fcdce8c4SStefano Zampini   }
2604fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2605fcdce8c4SStefano Zampini     PetscInt r = 0;
2606fcdce8c4SStefano Zampini     c->i[0] = 0;
2607fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2608fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2609fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2610fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2611fcdce8c4SStefano Zampini     }
2612fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2613fcdce8c4SStefano Zampini   }
2614fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2615fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2616fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2617fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2618fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2619fcdce8c4SStefano Zampini   c->rmax = 0;
2620fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2621fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2622fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2623fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2624fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2625fcdce8c4SStefano Zampini   }
2626fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2627fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2628fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2629fcdce8c4SStefano Zampini 
2630fcdce8c4SStefano Zampini   C->nonzerostate++;
2631fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2632fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2633fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2634fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2635fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2636fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2637fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2638abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2639fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2640fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2641fcdce8c4SStefano Zampini   }
2642fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2643fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2644fcdce8c4SStefano Zampini }
2645fcdce8c4SStefano Zampini 
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
  Picks mat->ops->productsymbolic for the product described by mat->product.
  B may be sparse or dense:
    - B of base type MATSEQDENSE: dense path (host SeqAIJ/SeqDense setup when A is bound to the CPU)
    - B (and, for ABC, C) of type MATSEQAIJCUSPARSE with no operand bound to the CPU: sparse-sparse path
    - anything else: defer to MatProductSetFromOptions_SeqAIJ()
  Product types not listed in a branch leave productsymbolic untouched.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *prod = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)prod->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B is only tested for the CUSPARSE type when neither A nor B is bound to the CPU */
  if (!(prod->A->boundtocpu || prod->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)prod->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  /* the third operand is only inspected for ABC; for every other type Ccusp keeps its PETSC_TRUE default */
  if (prod->type == MATPRODUCT_ABC) {
    Ccusp = PETSC_FALSE;
    if (!prod->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)prod->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }

  if (!Bdense && !(Bcusp && Ccusp)) { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  if (Bdense) {
    if (prod->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (prod->type == MATPRODUCT_AB   ||
               prod->type == MATPRODUCT_AtB  ||
               prod->type == MATPRODUCT_ABt  ||
               prod->type == MATPRODUCT_PtAP ||
               prod->type == MATPRODUCT_RARt) {
      if (!prod->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
    }
  } else { /* here Bcusp && Ccusp holds */
    if (prod->type == MATPRODUCT_AB  ||
        prod->type == MATPRODUCT_AtB ||
        prod->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (prod->type == MATPRODUCT_PtAP ||
               prod->type == MATPRODUCT_RARt ||
               prod->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  }
  PetscFunctionReturn(0);
}
2706ccdfe979SStefano Zampini 
/* yy = A xx: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with no additive vector (NULL) and both transpose flags off */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2715e6e9a74fSStefano Zampini 
/* zz = A xx + yy: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with both transpose flags off */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2724e6e9a74fSStefano Zampini 
/* yy = A^H xx: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with no additive vector (NULL) and both the transpose and Hermitian flags on */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2733e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with both the transpose and Hermitian flags on */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27429ae82921SPaul Mullowney 
/* yy = A^T xx: delegates to MatMultAddKernel_SeqAIJCUSPARSE() with no additive vector (NULL), transpose on, Hermitian off */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2751ca45077fSPaul Mullowney 
/*
  ScatterAdd - CUDA kernel computing y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Input Parameters:
+ n   - number of entries of x (and of idx) to process
. idx - destination index into y for each entry of x
- x   - values to add

  Output Parameter:
. y   - accumulation target, updated in place

  Notes:
  Only the x components of blockIdx/blockDim/threadIdx are used, so a 1-D launch with at least n
  threads in total is expected; surplus threads do nothing.

  The global index is formed in PetscInt arithmetic. Computing it in 32 bits (as 'int') can wrap for
  very large launches when PetscInt is 64-bit, and a wrapped (possibly negative) index would still
  satisfy i < n and access x, idx and y out of bounds.

  The update is a plain (non-atomic) +=; presumably idx holds no repeated entries - confirm against the caller.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
2757a0e72f99SJunchao Zhang 
2758afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2759e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27609ae82921SPaul Mullowney {
27619ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2762aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27639ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2764e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2765b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
276657d48284SJunchao Zhang   cudaError_t                  cerr;
2767aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2768e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2769e6e9a74fSStefano Zampini   PetscBool                    compressed;
2770afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2771afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2772afb2bd1cSJunchao Zhang #endif
27736e111a19SKarl Rupp 
27749ae82921SPaul Mullowney   PetscFunctionBegin;
2775*e8d2b73aSMark Adams   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
2776e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2777afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2778d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2779e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2780e6e9a74fSStefano Zampini   }
278134d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
278234d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2783e6e9a74fSStefano Zampini   if (!trans) {
27849ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2785*e8d2b73aSMark Adams     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2786e6e9a74fSStefano Zampini   } else {
27871a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
2788e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2789e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2790e6e9a74fSStefano Zampini     } else {
27911a2c6b5cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2792e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2793e6e9a74fSStefano Zampini     }
2794e6e9a74fSStefano Zampini   }
2795e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2796e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2797213423ffSJunchao Zhang 
2798e6e9a74fSStefano Zampini   try {
2799e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2800213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2801213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2802afb2bd1cSJunchao Zhang 
280385ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2804e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2805afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2806afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2807afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2808afb2bd1cSJunchao Zhang       */
2809e6e9a74fSStefano Zampini       xptr = xarray;
2810afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2811213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2812afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2813afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2814afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2815afb2bd1cSJunchao Zhang        */
2816afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2817afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2818afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2819afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2820afb2bd1cSJunchao Zhang       }
2821afb2bd1cSJunchao Zhang      #endif
2822e6e9a74fSStefano Zampini     } else {
2823afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2824afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2825afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2826afb2bd1cSJunchao Zhang        */
2827afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2828e6e9a74fSStefano Zampini       dptr = zarray;
2829e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2830afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2831e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2832a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2833e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2834e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2835e6e9a74fSStefano Zampini       }
2836afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2837afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2838afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2839afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2840afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2841afb2bd1cSJunchao Zhang       }
2842afb2bd1cSJunchao Zhang      #endif
2843e6e9a74fSStefano Zampini     }
28449ae82921SPaul Mullowney 
2845afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2846aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2847afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2848afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2849afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2850afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2851afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2852afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2853afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2854afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2855afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2856afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2857afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2858afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2859afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2860afb2bd1cSJunchao Zhang 
2861afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2862afb2bd1cSJunchao Zhang       } else {
2863afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2864afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2865afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2866afb2bd1cSJunchao Zhang       }
2867afb2bd1cSJunchao Zhang 
2868afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2869afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
28701a2c6b5cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2871afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2872afb2bd1cSJunchao Zhang                                beta,
2873afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2874afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2875afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2876afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2877afb2bd1cSJunchao Zhang      #else
28787656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2879e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2880a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2881afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2882aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2883e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
288457d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2885afb2bd1cSJunchao Zhang      #endif
2886aa372e3fSPaul Mullowney     } else {
2887213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2888afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2889afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2890afb2bd1cSJunchao Zhang        #else
2891301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2892e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2893afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2894e6e9a74fSStefano Zampini                                  xptr, beta,
289557d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2896afb2bd1cSJunchao Zhang        #endif
2897a65300a6SPaul Mullowney       }
2898aa372e3fSPaul Mullowney     }
289905035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2900958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2901aa372e3fSPaul Mullowney 
2902e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2903213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2904213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2905213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2906e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2907213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
29087656d835SStefano Zampini         }
2909213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2910c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
29117656d835SStefano Zampini       }
29127656d835SStefano Zampini 
2913213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2914213423ffSJunchao Zhang       if (compressed) {
2915e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2916a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2917a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2918a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
2919a0e72f99SJunchao Zhang          */
2920a0e72f99SJunchao Zhang        #if 0
2921a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2922a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2923a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2924e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2925c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
2926a0e72f99SJunchao Zhang        #else
2927a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
2928a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2929a0e72f99SJunchao Zhang        #endif
293005035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2931958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2932e6e9a74fSStefano Zampini       }
2933e6e9a74fSStefano Zampini     } else {
2934e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2935e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2936e6e9a74fSStefano Zampini       }
2937e6e9a74fSStefano Zampini     }
2938e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2939213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2940213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29419ae82921SPaul Mullowney   } catch(char *ex) {
29429ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29439ae82921SPaul Mullowney   }
2944e6e9a74fSStefano Zampini   if (yy) {
2945958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2946e6e9a74fSStefano Zampini   } else {
2947e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2948e6e9a74fSStefano Zampini   }
29499ae82921SPaul Mullowney   PetscFunctionReturn(0);
29509ae82921SPaul Mullowney }
29519ae82921SPaul Mullowney 
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - MatMultTransposeAdd() entry point for SEQAIJCUSPARSE.

   All the work is delegated to MatMultAddKernel_SeqAIJCUSPARSE(); this wrapper only
   fixes the two trailing flags (first one set, second one cleared).
   NOTE(review): the flags are presumably "transpose" and "hermitian" in that order --
   confirm against the kernel routine's signature.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE;
  const PetscBool herm  = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2960ca45077fSPaul Mullowney 
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - Finishes assembly of a SEQAIJCUSPARSE matrix.

   Runs the SeqAIJ assembly-end routine and then, for a final (non-flush) assembly of a
   matrix that is not bound to the CPU and that owns a device-side split-CSR structure,
   marks the GPU copy as the valid one.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat = NULL;

  PetscFunctionBegin;
  /* Only unfactored matrices keep a Mat_SeqAIJCUSPARSE in spptr; sample its device
     matrix pointer before the SeqAIJ assembly runs, as the original code does */
  if (A->factortype == MAT_FACTOR_NONE) deviceMat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;

  /* this does very little if assembled on GPU - call it? */
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);

  /* Flush assemblies and CPU-bound matrices never change the offload mask here */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && deviceMat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscFunctionReturn(0);
}
29779ae82921SPaul Mullowney 
29789ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). Setting these parameters accurately can
   increase performance during matrix assembly by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of calling this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible to
   improve the numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            mat;

  PetscFunctionBegin;
  /* The new matrix is stored straight into the caller's handle */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  mat  = *A;
  /* The same values are passed as both the local and the global sizes */
  ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(mat,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* The SeqAIJ preallocation routine takes a non-const array, hence the cast */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30379ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - Destroys the CUSPARSE-specific part of the matrix and then
   the underlying SeqAIJ matrix.

   For an unfactored matrix the device-side split-CSR structure pointer is detached from
   the Mat_SeqAIJCUSPARSE before that struct is destroyed, and is released here afterwards.
   Factored matrices instead keep a Mat_SeqAIJCUSPARSETriFactors in spptr.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* Composed functions cleared on destruction, in the order they are removed */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C"
  };
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat = NULL;
  size_t                     k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* Take ownership of the device matrix pointer before the container goes away */
    deviceMat       = cusp->deviceMat;
    cusp->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }

  if (deviceMat) {
    Mat_SeqAIJ                 *aij = (Mat_SeqAIJ*)A->data;
    PetscSplitCSRDataStructure hostCopy;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* Bring the descriptor to the host so the device pointer stored inside it can be read */
    cerr = cudaMemcpy(&hostCopy,deviceMat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    /* diag.i is freed only when the host matrix uses compressed rows */
    if (aij->compressedrow.use) {
      cerr = cudaFree(hostCopy.diag.i);CHKERRCUDA(cerr);
    }
    cerr = cudaFree(deviceMat);CHKERRCUDA(cerr);
  }

  for (k = 0; k < sizeof(composed)/sizeof(composed[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30739ae82921SPaul Mullowney 
/* Forward declarations: both routines are defined further down in this file */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A,MatType mtype,MatReuse reuse,Mat *newmat);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg);

/*
   MatDuplicate_SeqAIJCUSPARSE - Duplicates A into *B.

   The copy is first produced by the SeqAIJ duplicate routine and is then converted
   in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* Step 1: SeqAIJ duplicate */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* Step 2: in-place conversion of the duplicate to the CUSPARSE type */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30859ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

   Input Parameters:
+  Y   - matrix updated in place
.  a   - scalar multiplying X
.  X   - matrix added to Y
-  str - relation between the nonzero patterns of X and Y

   Dispatch:
   - X and Y use different axpy implementations: the SeqAIJ routine is used.
   - SAME_NONZERO_PATTERN (given, or detected by comparing the CSR row offsets and
     column indices on the device): a single cuBLAS axpy over the value arrays.
   - SUBSET_NONZERO_PATTERN: cuSPARSE spgeam writing into Y's existing CSR arrays.
   - anything else: the SeqAIJ routine is used.

   Only MAT_CUSPARSE_CSR storage is supported on the GPU paths.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *xaij  = (Mat_SeqAIJ*)X->data;
  Mat_SeqAIJ         *yaij  = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cuspX = (Mat_SeqAIJCUSPARSE*)X->spptr;
  Mat_SeqAIJCUSPARSE *cuspY = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  CsrMatrix          *csrX,*csrY;
  const PetscScalar  *xvals;
  PetscScalar        *yvals;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cuspY->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  if (cuspX->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
  csrY = (CsrMatrix*)cuspY->mat->mat;
  csrX = (CsrMatrix*)cuspX->mat->mat;

  /* see if we can turn this into a cublas axpy: same number of nonzeros, no compressed
     rows, and identical row offsets and column indices (the column indices are compared
     only if the row offsets already match) */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    const bool samePattern =
      thrust::equal(thrust::device,csrY->row_offsets->begin(),csrY->row_offsets->end(),csrX->row_offsets->begin()) &&
      thrust::equal(thrust::device,csrY->column_indices->begin(),csrY->column_indices->end(),csrX->column_indices->begin());

    if (samePattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  /* Neither GPU path applies: defer to the SeqAIJ implementation */
  if (str != SUBSET_NONZERO_PATTERN && str != SAME_NONZERO_PATTERN) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Both GPU paths operate directly on the device value arrays */
  ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);

  if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(xaij->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,xvals,one,yvals,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  } else { /* SUBSET_NONZERO_PATTERN */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    /* a and b live on the host, so switch the handle to host pointer mode for the call */
    stat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam_bufferSize(cuspY->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                                          &b,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                             cuspY->mat->descr,         yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* The output CSR arrays are Y's own arrays: the result overwrites Y in place */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                               &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                               &b,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                  cuspY->mat->descr,         yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgeam(cuspY->handle,Y->rmap->n,Y->cmap->n,
                               &a,cuspX->mat->descr,xaij->nz,xvals,csrX->row_offsets->data().get(),csrX->column_indices->data().get(),
                               &b,cuspY->mat->descr,yaij->nz,yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get(),
                                  cuspY->mat->descr,         yvals,csrY->row_offsets->data().get(),csrY->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    stat = cusparseSetPointerMode(cuspY->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  }

  ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xvals);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
318695639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - Scales all stored values of Y by a, i.e. Y = a*Y.

   The value array is obtained through MatSeqAIJCUSPARSEGetArray() and scaled with a
   single unit-stride cuBLAS scal call over y->nz entries.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ         *yaij = (Mat_SeqAIJ*)Y->data;
  PetscScalar        *yvals;
  PetscBLASInt       count = 1;
  const PetscBLASInt stride = 1;
  cublasHandle_t     handle;
  cublasStatus_t     bstat;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&yvals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* PetscBLASIntCast reports an error code, checked below, if nz does not fit */
  ierr = PetscBLASIntCast(yaij->nz,&count);CHKERRQ(ierr);

  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  bstat = cublasXscal(handle,count,&a,yvals,stride);CHKERRCUBLAS(bstat);
  cerr  = WaitForCUDA();CHKERRCUDA(cerr);
  ierr  = PetscLogGpuFlops(count);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yvals);CHKERRQ(ierr);
  /* Values changed, so drop any cached diagonal information */
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
321033c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeros all stored values of A, keeping the nonzero pattern.

   The host value array is always zeroed. For an unfactored matrix stored on the device
   in MAT_CUSPARSE_CSR format, the device values of the matrix (and of its cached
   transpose, if present) are zeroed as well; in that case both copies are valid
   afterwards (PETSC_OFFLOAD_BOTH). Otherwise only the host copy is marked valid
   (PETSC_OFFLOAD_CPU).

   For the ELL/HYB storage formats the pointer held in mat->mat is a cusparseHybMat_t,
   not a CsrMatrix, so it must not be dereferenced as a CsrMatrix; those formats are
   therefore never touched on the device here.
   NOTE(review): this relies on the next host-to-device copy rebuilding the device
   matrix when the offload mask is PETSC_OFFLOAD_CPU -- confirm against
   MatSeqAIJCUSPARSECopyToGPU().
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  PetscBool      both = PETSC_FALSE;
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* Only the CSR format stores a CsrMatrix behind mat->mat / matTranspose->mat */
    if (spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix *matrix = (CsrMatrix*)spptr->mat->mat;

        if (matrix && matrix->values) {
          both = PETSC_TRUE;
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix *matrix = (CsrMatrix*)spptr->matTranspose->mat;

        if (matrix && matrix->values) {
          thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
        }
      }
    }
  }
  /* Zero the host values directly: a->i[nrows] is the number of stored entries */
  ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  else A->offloadmask = PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
32423fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Switches an unfactored matrix between the SeqAIJ
   and the CUSPARSE implementations of its operations.

   Input Parameters:
+  A   - the matrix; factored matrices are left untouched
-  flg - PETSC_TRUE selects the SeqAIJ operations, PETSC_FALSE the CUSPARSE ones

   When binding to the CPU, MatSeqAIJCUSPARSECopyFromGPU() is called first and the
   CUSPARSE-specific composed functions are removed; when unbinding they are installed
   again. In both cases A->boundtocpu and the inode usage flag are set to flg.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);

  if (!flg) {
    /* ---- CUSPARSE implementations ---- */
    /* matrix-vector products */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    /* value updates */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    /* matrix products */
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;

    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* ---- SeqAIJ implementations ---- */
    /* Done before any operation is swapped, as in the original ordering */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    /* matrix-vector products; the Hermitian variants are cleared, not replaced */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    /* value updates */
    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    /* matrix products */
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;

    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }

  A->boundtocpu  = flg;
  /* inodes are enabled exactly when the matrix is bound to the CPU */
  aij->inode.use = flg;
  PetscFunctionReturn(0);
}
3293a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - turns a SeqAIJ matrix into a SeqAIJCUSPARSE one.

  Input Parameters:
+  A     - the matrix to convert
.  mtype - requested type (not inspected here)
-  reuse - MAT_INITIAL_MATRIX duplicates A into *newmat, MAT_REUSE_MATRIX copies A into the
           existing *newmat, any other value operates on *newmat as passed in

  Output Parameter:
.  newmat - the matrix carrying the CUSPARSE operations and type name
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  /* first use of CUSPARSE may be via MatConvert */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to duplicate or copy */
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  /* attach the GPU-side container only when there is none yet and we are not reusing a matrix */
  if (!B->spptr && reuse != MAT_REUSE_MATRIX) {
    if (B->factortype != MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSETriFactors *fs;

      ierr = PetscNew(&fs);CHKERRQ(ierr);
      stat = cusparseCreate(&fs->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(fs->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = fs;
    } else {
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that are installed regardless of the CPU/GPU binding */
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;

  /* passing PETSC_FALSE installs the remaining CUSPARSE operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33479ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - constructor for MATSEQAIJCUSPARSE.

  Builds B as a SeqAIJ matrix and then converts it in place to SeqAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: host-side SeqAIJ setup */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* step 2: in-place conversion to the CUSPARSE type */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
335702fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later
   and are not supported since CUDA 11.0.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
33747f756511SDominic Meiser 
3375bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
33760f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - registers the factorization entry points of this file.

  The band solver is registered for MATSEQAIJ with LU only; the regular CUSPARSE solver is
  registered for MATSEQAIJCUSPARSE with LU, Cholesky, ILU and ICC.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const PetscInt      nftypes  = (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscErrorCode      ierr;
  PetscInt            k;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
339029b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - releases a Mat_SeqAIJCUSPARSE container and everything it owns.

  Does nothing when *cusparsestruct is NULL. Destroys the mult structs for the matrix and its
  transpose, deletes the auxiliary arrays, destroys the cuSPARSE handle (if any) and finally
  frees the container itself through PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* deleting a NULL pointer is a no-op, so no guards are needed here */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34107f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - deletes a CsrMatrix together with its values, column_indices and
  row_offsets arrays, then sets *mat to NULL. A NULL *mat is accepted and ignored.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
34237f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - releases one
  Mat_SeqAIJCUSPARSETriFactorStruct: its matrix descriptor, solve analysis info, CSR matrix,
  device solve buffer, host AA_h buffer and (CUDA >= 11) the csr2csc buffer, then frees the
  struct with PetscFree(). A NULL *trifactor is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) { /* released with cudaFreeHost(), unlike the device buffers around it */
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34437f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (mult-struct overload) - releases one
  Mat_SeqAIJCUSPARSEMultStruct and sets *matstruct to NULL.

  Input Parameters:
+  matstruct - address of the struct pointer; a NULL *matstruct is accepted and ignored
-  format    - storage format of (*matstruct)->mat; selects how that member is destroyed

  Notes:
  For MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB the stored matrix is destroyed as a cusparseHybMat_t;
  with CUDA >= 11 these formats raise PETSC_ERR_SUP instead. Any other format is treated as a
  CsrMatrix. The error code returned by CsrMatrix_Destroy() is propagated to the caller.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code like every other PETSc call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* scalar constants are released with cudaFree() */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* only the SpMV slots flagged as initialized own a buffer and dense-vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
34877f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the contents of a triangular-factors container
  while keeping the container itself (and its cuSPARSE handle) alive.

  Input Parameter:
.  trifactors - address of the container pointer; a NULL *trifactors is accepted and ignored

  Notes:
  Every released member is set back to NULL so that the routine can safely be called again,
  e.g. from MatSeqAIJCUSPARSETriFactors_Destroy(), without freeing the same memory twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* avoid a second cudaFree() on the next Reset/Destroy */
    }
    if ((*trifactors)->i_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* avoid a second cudaFree() on the next Reset/Destroy */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3510ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets the triangular-factors container, destroys its
  cuSPARSE handle (if any) and frees the container with PetscFree().
  A NULL *trifactors is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
35277e8381f9SStefano Zampini 
/* Strict ordering of (i,j) pairs: first by i, ties broken by j */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = t1.get<0>();
    const PetscInt i2 = t2.get<0>();

    if (i1 != i2) return i1 < i2;
    return t1.get<1>() < t2.get<1>();
  }
};
35387e8381f9SStefano Zampini 
/* Equality of (i,j) pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
35487e8381f9SStefano Zampini 
/* Binary "changed" indicator: 0 when the two indices are equal, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
35577e8381f9SStefano Zampini 
/* Logical OR of two flags, returned as 0/1 (despite the name, this is not an arithmetic sum) */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
35667e8381f9SStefano Zampini 
35677e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of a matrix preallocated in COO form.

  Input Parameters:
+  A     - the matrix
.  v     - the values; may be device memory (detected with isCudaMem()) or host memory, in which
           case they are first staged into a device array. NULL is allowed: with INSERT_VALUES
           the stored values are set to zero, otherwise they are left untouched
-  imode - ADD_VALUES adds to the stored values, anything else overwrites them

  Notes:
  v is read through the permutation stored in cooPerm. When cooPerm_a is present it is used as
  the key array of a reduce_by_key that sums entries sharing a key (repeated COO entries).
  If no COO permutation is stored the routine only assembles the matrix.
  On return the matrix is marked assembled with its data valid on the GPU.

  The staging arrays are scoped objects, so they are released on every exit path, including
  the early returns performed by the error-checking macros.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
  } else {
    THRUSTARRAY                           cooPerm_v; /* device staging copy of v, filled only when v is host memory */
    thrust::device_ptr<const PetscScalar> d_v;
    const PetscInt                        n = cusp->cooPerm->size();

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      cooPerm_v.assign(v,v+n);
      d_v = cooPerm_v.data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) {
        /* reduce entries sharing a key into a scratch array, then add it to the stored values */
        THRUSTARRAY cooPerm_w(matrix->values->size());
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAEquals());
      }
    }
    /* wait before the staging array goes out of scope and before stopping the GPU timer */
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
36467e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached transpose of A as out of date.

  Input Parameters:
+  A       - the matrix, must be of type MATSEQAIJCUSPARSE
-  destroy - when true, also destroys the transpose mult struct and the csr2csc_i array

  Notes:
  Nothing is done (not even clearing the flag) when A has no CUSPARSE container attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3663a49f1ed0SStefano Zampini 
36647e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR structure of A from n COO index pairs.

  Input Parameters:
+  A     - the matrix
.  n     - number of COO entries
.  coo_i - row indices of the entries (copied to the device with assign())
-  coo_j - column indices of the entries (copied to the device with assign())

  Notes:
  The pairs are sorted by (row, column) on the device; the resulting permutation is kept in
  cooPerm. If duplicates exist, cooPerm_a holds for each sorted entry the result of an
  inclusive scan over "pair differs from its predecessor" flags; otherwise cooPerm_a is NULL.
  The host CSR arrays of A are rebuilt from the unique pairs, zero values are pushed to the
  GPU, any cached transpose is destroyed, and A is left unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* same sanity check as MatSetValuesCOO_SeqAIJCUSPARSE(): cusp is dereferenced right below */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) { /* stored permutation arrays have the wrong length: drop them */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* cooPerm starts as 0,1,2,... and is permuted along with the (i,j) keys by the sort */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted indices: unique() below compacts d_i/d_j in place */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag entries whose row or column differs from the previous one, then scan the flags */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries with row index <= r; copied below into a->i[r+1] */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays with separately allocated a, j and i */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  /* values still have to be provided, so the matrix is not assembled yet */
  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3764ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayRead - Returns a read-only pointer to the values array of the
  CSR structure attached to a MATSEQAIJCUSPARSE matrix.

  Input Parameter:
. A - the matrix, must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the CsrMatrix values array

  Notes:
  MatSeqAIJCUSPARSECopyToGPU() is invoked before the pointer is fetched.
  The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.
  PETSC_ERR_COR is raised if the mult struct or its values array is missing.
  Pair with MatSeqAIJCUSPARSERestoreArrayRead().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only touch A->spptr once the header and type checks above have accepted A */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3783ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - Ends a read-only access started with
  MatSeqAIJCUSPARSEGetArrayRead().

  Input Parameters:
+ A - the matrix, must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; it is reset to NULL

  Notes:
  The matrix itself is not modified here; only the caller's pointer is cleared.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate the caller's handle so it cannot be reused by accident */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3793ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - Returns a read-write pointer to the values array of the
  CSR structure attached to a MATSEQAIJCUSPARSE matrix.

  Input Parameter:
. A - the matrix, must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the CsrMatrix values array

  Notes:
  MatSeqAIJCUSPARSECopyToGPU() is invoked before the pointer is fetched.
  Because the caller may modify the values, the offload mask is set to PETSC_OFFLOAD_GPU
  and MatSeqAIJCUSPARSEInvalidateTranspose() is called with PETSC_FALSE.
  The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.
  Pair with MatSeqAIJCUSPARSERestoreArray().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only touch A->spptr once the header and type checks above have accepted A */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may write through the pointer: mark the GPU copy as the current one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3814039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Ends a read-write access started with
  MatSeqAIJCUSPARSEGetArray().

  Input Parameters:
+ A - the matrix, must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; it is reset to NULL

  Notes:
  The object state of A is increased before the caller's pointer is cleared.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the object state of A now that write access is being given back */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* invalidate the caller's handle so it cannot be reused by accident */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3827039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Returns a write-only pointer to the values array of the
  CSR structure attached to a MATSEQAIJCUSPARSE matrix.

  Input Parameter:
. A - the matrix, must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer of the CsrMatrix values array

  Notes:
  Unlike MatSeqAIJCUSPARSEGetArray(), this routine does not call
  MatSeqAIJCUSPARSECopyToGPU(); the mult struct and its values array must already exist,
  otherwise PETSC_ERR_COR is raised.
  The offload mask is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose()
  is called with PETSC_FALSE.
  The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.
  Pair with MatSeqAIJCUSPARSERestoreArrayWrite().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* only touch A->spptr once the header and type checks above have accepted A */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller will overwrite the values: mark the GPU copy as the current one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3847ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Ends a write-only access started with
  MatSeqAIJCUSPARSEGetArrayWrite().

  Input Parameters:
+ A - the matrix, must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; it is reset to NULL

  Notes:
  The object state of A is increased before the caller's pointer is cleared.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* bump the object state of A now that write access is being given back */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* invalidate the caller's handle so it cannot be reused by accident */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3860ed502f03SStefano Zampini 
/* Strict-weak ordering on 4-tuples that compares the first entry, then the second;
   the third and fourth entries do not take part in the comparison.
   Used as the comparator of thrust::merge in MatSeqAIJCUSPARSEMergeMats(). */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    /* leading entries differ: they alone decide the order */
    if (i1 != i2) return i1 < i2;
    /* tie on the leading entry: fall back to the second one */
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
3871ed502f03SStefano Zampini 
/* Unary functor adding a fixed integer offset to its argument */
struct Shift
{
  int _shift; /* offset added by operator() */

  Shift(int shift) { _shift = shift; }
  __host__ __device__
  inline int operator() (const int &c)
  {
    const int shifted = _shift + c;
    return shifted;
  }
};
3883ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3885ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3886ed502f03SStefano Zampini {
3887ed502f03SStefano Zampini   PetscErrorCode               ierr;
3888ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3889ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3890ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3891ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3892ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
3893ed502f03SStefano Zampini   cusparseStatus_t             stat;
3894ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
3895ed502f03SStefano Zampini   cudaError_t                  cerr;
3896ed502f03SStefano Zampini 
3897ed502f03SStefano Zampini   PetscFunctionBegin;
3898ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3899ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3900ed502f03SStefano Zampini   PetscValidPointer(C,4);
3901ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3902ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3903ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3904ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3905ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3906ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3907ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
3908ed502f03SStefano Zampini     m     = A->rmap->n;
3909ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
3910ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3911ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3912ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3913ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
3914ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3915ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3916ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
3917ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
3918ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
3919ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
3920ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
3921ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
3922ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
3923ed502f03SStefano Zampini     Ccusp->nrows    = m;
3924ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
3925ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
3926ed502f03SStefano Zampini     Ccsr->num_rows  = m;
3927ed502f03SStefano Zampini     Ccsr->num_cols  = n;
3928ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3929ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3930ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3931ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3932ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3933ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3934ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3935ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3936ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3937ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3938ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
39391a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
39401a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3941ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3942ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3943ed502f03SStefano Zampini 
3944ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
3945ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3946ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
3947ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
3948ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
3949ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3950ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3951ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
3952ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
3953ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3954ed502f03SStefano Zampini     if (c->nz) {
39552ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
39562ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
39572ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
39582ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
39592ed87e7eSStefano Zampini 
3960ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
3961ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
3962ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3963ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3964ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3965ed502f03SStefano Zampini         }
39662ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
39672ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
3968ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
3969ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
3970ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3971ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3972ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3973ed502f03SStefano Zampini         }
39742ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
39752ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
3976ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
39772ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
39782ed87e7eSStefano Zampini                               Aroff->data().get(),
39792ed87e7eSStefano Zampini                               Annz,
39802ed87e7eSStefano Zampini                               m,
39812ed87e7eSStefano Zampini                               Acoo->data().get(),
39822ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3983ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
39842ed87e7eSStefano Zampini                               Broff->data().get(),
3985ed502f03SStefano Zampini                               Bnnz,
3986ed502f03SStefano Zampini                               m,
39872ed87e7eSStefano Zampini                               Bcoo->data().get(),
3988ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
39892ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
39902ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
39912ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
39928909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3993ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3994ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
39958909a122SStefano Zampini #else
39968909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
39978909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
39988909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
39998909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
40008909a122SStefano Zampini #endif
40012ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
40022ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
40032ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
40042ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
40052ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
40062ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4007ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4008ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4009ed502f03SStefano Zampini       thrust::advance(p2,Annz);
40102ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
40118909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
40128909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
40138909a122SStefano Zampini #endif
40142ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
40152ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
40162ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
40172ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
40182ed87e7eSStefano Zampini #else
40192ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
40202ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
40212ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
40222ed87e7eSStefano Zampini #endif
4023ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
40242ed87e7eSStefano Zampini                               Ccoo->data().get(),
4025ed502f03SStefano Zampini                               c->nz,
4026ed502f03SStefano Zampini                               m,
4027ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4028ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4029ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4030ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
40312ed87e7eSStefano Zampini       delete wPerm;
40322ed87e7eSStefano Zampini       delete Acoo;
40332ed87e7eSStefano Zampini       delete Bcoo;
40342ed87e7eSStefano Zampini       delete Ccoo;
4035ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4036ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4037ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4038ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4039ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4040ed502f03SStefano Zampini #endif
40411a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4042ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4043ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4044ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4045ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4046ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4047ed502f03SStefano Zampini 
40481a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
40491a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4050a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4051ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4052ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4053ed502f03SStefano Zampini         CcsrT->num_rows = n;
4054ed502f03SStefano Zampini         CcsrT->num_cols = m;
4055ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4056ed502f03SStefano Zampini 
4057ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4058ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4059ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4060ed502f03SStefano Zampini 
4061ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4062ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4063ed502f03SStefano Zampini         if (AT) {
4064ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4065ed502f03SStefano Zampini           thrust::advance(rT,-1);
4066ed502f03SStefano Zampini         }
4067ed502f03SStefano Zampini         if (BT) {
4068ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4069ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4070ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4071ed502f03SStefano Zampini         }
4072ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4073ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4074ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4075ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4076ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4077ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4078ed502f03SStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4079ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4080ed502f03SStefano Zampini 
4081ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4082ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4083ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4084ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4085ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4086ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4087ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4088ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4089ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4090ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4091ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4092ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4093ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4094ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4095ed502f03SStefano Zampini #endif
4096ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4097ed502f03SStefano Zampini       }
4098ed502f03SStefano Zampini     }
4099ed502f03SStefano Zampini 
4100ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4101ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4102ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4103ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4104ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4105ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4106ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4107ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4108ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4109ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4110ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4111ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4112ed502f03SStefano Zampini     } else {
4113ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4114ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4115ed502f03SStefano Zampini     }
4116ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4117ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4118ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4119ed502f03SStefano Zampini     c->maxnz = c->nz;
4120ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4121ed502f03SStefano Zampini     c->rmax = 0;
4122ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4123ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4124ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4125ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4126ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4127ed502f03SStefano Zampini     }
4128ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4129ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4130ed502f03SStefano Zampini     (*C)->nonzerostate++;
4131ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4132ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4133ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4134ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4135ed502f03SStefano Zampini   } else {
4136ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4137ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4138ed502f03SStefano Zampini     if (c->nz) {
4139ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4140ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4141ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4142ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4143ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4144ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4145ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4146ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4147ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4148ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4149ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4150ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4151ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4152ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4153ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4154ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4155ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4156ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4157ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4158ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4159ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4160ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4161ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4162ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4163ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4164ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4165ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4166ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4167ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4168a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
41691a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4170ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4171ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4172ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4173ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4174ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4175ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4176ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4177ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
41781a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4179ed502f03SStefano Zampini       }
4180ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4181ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4182ed502f03SStefano Zampini     }
4183ed502f03SStefano Zampini   }
4184ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4185ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4186ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4187ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4188ed502f03SStefano Zampini   PetscFunctionReturn(0);
4189ed502f03SStefano Zampini }
4190c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the value array of A into v.

  Input Parameters:
+ A   - matrix whose value array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
        (used below as a device pointer)
. n   - number of entries to copy
- idx - positions (into the value array) of the entries to gather; it is uploaded
        with a host-to-device assign, so it is treated as host memory.
        If idx is NULL (or n is 0) the first n entries are copied contiguously.

  Output Parameter:
. v - destination for the n entries; isCudaMem() decides whether it is written
      as device memory or as host memory

  Notes:
  When v is host memory and idx is given, the gather is done into a temporary
  device buffer which is then copied back to v.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the requested positions so the gather can run on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* staging buffer, only sized when v is not device memory; being a stack object
       it is released on every exit path, including the early returns of the error macros */
    THRUSTARRAY w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[widx[i]] for i in [0,n) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so log that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4230