xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 7487cd7ca1dddf3cbc146be559ee2e39856c5efc)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Names for MatCUSPARSEStorageFormat: the value names, followed by the enum type name,
   the common prefix of the enum constants, and a terminating 0 */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The enums below are copied from cusparse.h in CUDA-11.0. The string tables that follow list the
    names in 0-based integer value order, because PetscOptionsEnum() is used to parse the
    corresponding command line options and maps a name to its position in the table.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  /* cusparseSpMVAlg_t names, positions 0..3 */
  const char *const MatCUSPARSESpMVAlgorithms[] = {
    "MV_ALG_DEFAULT",
    "COOMV_ALG",
    "CSRMV_ALG1",
    "CSRMV_ALG2",
    "cusparseSpMVAlg_t",
    "CUSPARSE_",
    0
  };
  /* cusparseSpMMAlg_t names, positions 0..6 (note CSR_ALG1 = 4 precedes COO_ALG4 = 5) */
  const char *const MatCUSPARSESpMMAlgorithms[] = {
    "ALG_DEFAULT",
    "COO_ALG1",
    "COO_ALG2",
    "COO_ALG3",
    "CSR_ALG1",
    "COO_ALG4",
    "CSR_ALG2",
    "cusparseSpMMAlg_t",
    "CUSPARSE_SPMM_",
    0
  };
  /* cusparseCsr2CscAlg_t names; cusparse does not have enum 0, so an "INVALID" placeholder fills position 0 */
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {
    "INVALID",
    "ALG1",
    "ALG2",
    "cusparseCsr2CscAlg_t",
    "CUSPARSE_CSR2CSC_",
    0
  };
#endif
589ae82921SPaul Mullowney 
/* ---- Forward declarations of file-local routines ---- */

/* LU / ILU factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Cholesky / ICC factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* Triangular solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);

/* Matrix-vector products */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* Options and whole-matrix operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);

/* Destruction of the GPU-side data structures; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* Host/device data movement and transpose bookkeeping */
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* Sub-array copy and COO assembly */
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscCount,const PetscInt[],const PetscInt[]);
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
94c215019aSStefano Zampini 
/*
  MatCUSPARSESetStream - Stores a CUDA stream in the matrix's Mat_SeqAIJCUSPARSE data and
  attaches that stream to the cuSPARSE handle kept in the same structure.

  Input Parameters:
+  A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
-  stream - the CUDA stream to use

  Raises PETSC_ERR_COR if A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  PetscCheckFalse(!spdata,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  /* remember the stream first, then hand the stored value to cuSPARSE */
  spdata->stream = stream;
  cstat = cusparseSetStream(spdata->handle,spdata->stream);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
106b06137fdSPaul Mullowney 
/*
  MatCUSPARSESetHandle - Installs a cuSPARSE handle in the matrix's Mat_SeqAIJCUSPARSE data.

  Input Parameters:
+  A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
-  handle - the cuSPARSE handle to store

  Notes:
  If a different, non-null handle is already stored it is destroyed with cusparseDestroy()
  before being replaced. The stored handle is then switched to CUSPARSE_POINTER_MODE_DEVICE.

  Raises PETSC_ERR_COR if A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cstat;

  PetscFunctionBegin;
  PetscCheckFalse(!spdata,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  /* release the old handle only when there is one and it is not the handle being installed */
  if (spdata->handle && spdata->handle != handle) {
    cstat = cusparseDestroy(spdata->handle);CHKERRCUSPARSE(cstat);
  }
  /* storing is a no-op when the same handle was already present */
  spdata->handle = handle;
  cstat = cusparseSetPointerMode(spdata->handle,CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cstat);
  PetscFunctionReturn(0);
}
123b06137fdSPaul Mullowney 
/*
  MatCUSPARSEClearHandle - Forgets the cuSPARSE handle stored in a MATSEQAIJCUSPARSE matrix.

  Input Parameter:
.  A - the matrix

  Notes:
  The stored handle is only set to 0; cusparseDestroy() is not called here.
  Nothing is done when A is not of type MATSEQAIJCUSPARSE or when A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscBool          iscusparse;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && spdata) {
    /* drop the reference without destroying the handle */
    spdata->handle = 0;
  }
  PetscFunctionReturn(0);
}
136b06137fdSPaul Mullowney 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

  Input Parameter:
.  A - the factored matrix (not inspected)

  Output Parameter:
.  type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  type[0] = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1439ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1569ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) factor matrix for the cusparse solver.

  Input Parameters:
+  A     - the matrix to be factored
-  ftype - the requested factorization type

  Output Parameter:
.  B - the new MATSEQAIJCUSPARSE factor matrix, sized n x n with n = A->rmap->n

  Notes:
  LU, ILU and ILUDT install the LU/ILU symbolic routines; Cholesky and ICC install the
  Cholesky/ICC symbolic routines. When A is bound to the CPU the plain SeqAIJ symbolic
  routines are installed instead of the CUSPARSE ones. Any other factor type raises PETSC_ERR_SUP.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt nloc = A->rmap->n;
  Mat            F;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B;
  ierr = MatSetSizes(F,nloc,nloc,nloc,nloc);CHKERRQ(ierr);
  /* the factor type is recorded before the type is set */
  F->factortype = ftype;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* carry the CPU binding over to the factor only when A asks for it to propagate */
  if (A->boundtocpu && A->bindingpropagates) {
    ierr = MatBindToCPU(F,PETSC_TRUE);CHKERRQ(ierr);
  }

  switch (ftype) {
  case MAT_FACTOR_LU:
  case MAT_FACTOR_ILU:
  case MAT_FACTOR_ILUDT:
    ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
    if (A->boundtocpu) {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    } else {
      F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* nested dissection for full LU, natural ordering for the incomplete variants */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
    break;
  case MAT_FACTOR_CHOLESKY:
  case MAT_FACTOR_ICC:
    if (A->boundtocpu) {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    } else {
      F->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      F->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    }
    /* nested dissection for full Cholesky, natural ordering for ICC */
    ierr = PetscStrallocpy(MATORDERINGND,(char**)&F->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
    ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&F->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
  }

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  F->canuseordering = PETSC_TRUE;
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1989ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Records the requested GPU storage format in the
  matrix's Mat_SeqAIJCUSPARSE data.

  Input Parameters:
+  A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
.  op     - MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL; both store the format in the same field
-  format - the storage format to record

  Any other value of op raises PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  /* reject everything except the two supported operations up front */
  if (op != MAT_CUSPARSE_MULT && op != MAT_CUSPARSE_ALL) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  spdata->format = format;
  PetscFunctionReturn(0);
}
2169ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB; the latter two require CUDA 4.2)

   Notes:
   The request is forwarded through PetscTryMethod() to the method composed on A
   under the name "MatCUSPARSESetFormat_C".

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
243e057df02SPaul Mullowney 
/*
  MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE - Stores the use-CPU-solve flag in the matrix's
  Mat_SeqAIJCUSPARSE data.

  Input Parameters:
+  A       - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
-  use_cpu - value to store in use_cpu_solve
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
{
  PetscFunctionBegin;
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->use_cpu_solve = use_cpu;
  PetscFunctionReturn(0);
}
252365b711fSMark Adams 
/*@
   MatCUSPARSESetUseCPUSolve - Selects whether MatSolve is performed with the built-in CPU routine.

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  use_cpu - set flag for using the built-in CPU MatSolve

   Notes:
   The cuSparse LU solver currently computes the factors with the built-in CPU method
   and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   Use this method to specify whether the solve is done on the CPU or the GPU (GPU is the default).

   The request is forwarded through PetscTryMethod() to the method composed on A
   under the name "MatCUSPARSESetUseCPUSolve_C".

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* dispatch to the type-specific implementation */
  ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
280365b711fSMark Adams 
/*
  MatSetOption_SeqAIJCUSPARSE - Option handler for SeqAIJCUSPARSE matrices.

  Input Parameters:
+  A   - the matrix
.  op  - the option being set
-  flg - the new value of the option

  Notes:
  MAT_FORM_EXPLICIT_TRANSPOSE is handled here; every other option is passed on to
  MatSetOption_SeqAIJ().
*/
PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (op == MAT_FORM_EXPLICIT_TRANSPOSE) {
    /* when the option is being switched off, invalidate the transpose so that
       switching it back on later cannot lead to logic errors */
    if (!flg && A->form_explicit_transpose) {
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
    }
    A->form_explicit_transpose = flg;
  } else {
    ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
298e6e9a74fSStefano Zampini 
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization: the factors are computed by the
  SeqAIJ routine on the host and, unless the CPU solve was requested, prepared for GPU solves.

  Input Parameters:
+  B    - the factor matrix
.  A    - the matrix being factored
-  info - factorization options, passed through to MatLUFactorNumeric_SeqAIJ()

  Notes:
  After the host factorization B is marked PETSC_OFFLOAD_CPU. When use_cpu_solve is not set,
  the solve/solvetranspose operations of B are replaced by the CUSPARSE versions (the
  NaturalOrdering variants when both the row and column permutations are the identity) and
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU() is called on B. matsolve and matsolvetranspose
  are always cleared.

  NOTE(review): B->spptr is read here as a Mat_SeqAIJCUSPARSE, whereas the symbolic factorization
  routines in this file cast the factor's spptr to Mat_SeqAIJCUSPARSETriFactors - confirm that
  use_cpu_solve is read from the intended structure.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ         *fact = (Mat_SeqAIJ*)B->data;
  IS                 rperm = fact->row,cperm = fact->col;
  Mat_SeqAIJCUSPARSE *spdata = (Mat_SeqAIJCUSPARSE*)B->spptr;
  PetscBool          rident,cident;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* factor on the host using the current values of A */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the permutations decide which version of MatSolve is needed */
  ierr = ISIdentity(rperm,&rident);CHKERRQ(ierr);
  ierr = ISIdentity(cperm,&cident);CHKERRQ(ierr);

  /* cleared regardless of ordering or solve location */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  if (!spdata->use_cpu_solve) {
    if (rident && cident) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
    /* get the triangular factors */
    ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
338bddcd29dSMark Adams 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the -mat_cusparse_* command line options.

  Input Parameters:
+  PetscOptionsObject - the options context used by the PetscOptionsXXX() macros
-  A                  - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE

  Notes:
  Options are only queried for unfactored matrices (A->factortype == MAT_FACTOR_NONE):
  -mat_cusparse_mult_storage_format, -mat_cusparse_storage_format, -mat_cusparse_use_cpu_solve and,
  with CUDA 11.0 or later, -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg and -mat_cusparse_csr2csc_alg.
  For the algorithm options, PETSC_ERR_SUP is raised when the option was given and the position-based
  value produced by PetscOptionsEnum() no longer matches the numeric value of the cuSPARSE enum.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *spdata = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                set;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)spdata->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)spdata->format,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }

    /* the option value is written straight into the struct and then also passed through the setter */
    ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",spdata->use_cpu_solve,&spdata->use_cpu_solve,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetUseCPUSolve(A,spdata->use_cpu_solve);CHKERRQ(ierr);
    }
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)spdata->spmvAlg,(PetscEnum*)&spdata->spmvAlg,&set);CHKERRQ(ierr);
    /* PetscOptionsEnum() assigns values by position in MatCUSPARSESpMVAlgorithms[], so if the option
       was used, verify that the position still agrees with the cuSPARSE enum value */
#if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    PetscCheckFalse(set && CUSPARSE_SPMV_CSR_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#else
    PetscCheckFalse(set && CUSPARSE_CSRMV_ALG1 != 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
#endif

    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)spdata->spmmAlg,(PetscEnum*)&spdata->spmmAlg,&set);CHKERRQ(ierr);
    PetscCheckFalse(set && CUSPARSE_SPMM_CSR_ALG1 != 4,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)spdata->csr2cscAlg,(PetscEnum*)&spdata->csr2cscAlg,&set);CHKERRQ(ierr);
    PetscCheckFalse(set && CUSPARSE_CSR2CSC_ALG1 != 1,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
#endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3799ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU factorization for the cusparse solver.

  Input Parameters:
+  B     - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
.  A     - the matrix to factor
.  isrow - row permutation
.  iscol - column permutation
-  info  - factorization options

  Notes:
  Resets the triangular-factor data of B, runs MatILUFactorSymbolic_SeqAIJ(), and then installs
  MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization of B.
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard whatever a previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  /* the symbolic work itself is done by the SeqAIJ implementation */
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3919ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU factorization for the cusparse solver.

  Input Parameters:
+  B     - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
.  A     - the matrix to factor
.  isrow - row permutation
.  iscol - column permutation
-  info  - factorization options

  Notes:
  Resets the triangular-factor data of B, runs MatLUFactorSymbolic_SeqAIJ(), and then installs
  MatLUFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization of B.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard whatever a previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  /* the symbolic work itself is done by the SeqAIJ implementation */
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
4039ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC factorization for the cusparse solver.

  Input Parameters:
+  B    - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
.  A    - the matrix to factor
.  perm - permutation
-  info - factorization options

  Notes:
  Resets the triangular-factor data of B, runs MatICCFactorSymbolic_SeqAIJ(), and then installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization of B.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard whatever a previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  /* the symbolic work itself is done by the SeqAIJ implementation */
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
415087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky factorization for the cusparse solver.

  Input Parameters:
+  B    - the factor matrix; B->spptr is interpreted as a Mat_SeqAIJCUSPARSETriFactors
.  A    - the matrix to factor
.  perm - permutation
-  info - factorization options

  Notes:
  Resets the triangular-factor data of B, runs MatCholeskyFactorSymbolic_SeqAIJ(), and then installs
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE() as the numeric factorization of B.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *trifactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  /* discard whatever a previous factorization left behind */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&trifactors);CHKERRQ(ierr);
  /* the symbolic work itself is done by the SeqAIJ implementation */
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
427087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - builds (or refreshes) the lower triangular ILU factor
  kept in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr.

  Nothing is done when A has no rows, or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED
  nor PETSC_OFFLOAD_CPU.

  First call (no lower factor stored yet):
    - a CSR matrix is assembled in pinned host memory; every row receives the entries taken from
      the SeqAIJ arrays followed by an explicit 1.0 on the diagonal,
    - the CUSPARSE matrix descriptor, the CsrMatrix arrays and the solve-analysis data are created,
    - the pinned value array is kept in AA_h for later updates, the index arrays are released.

  Subsequent calls: only the values are rebuilt in AA_h and assigned to csrMat->values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows    = A->rmap->n;
  const PetscInt                    *rowptr  = aij->i;
  PetscInt                          row,len,nnzLo;
  PetscInt                          src,dst; /* read cursor into the SeqAIJ arrays, write cursor into the new arrays */
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros of the lower factor, counting the 1's on the diagonal */
    nnzLo = nrows + rowptr[nrows] - rowptr[1];

    if (lo) {
      /* ---- the structure already exists: refresh the values only ---- */
      if (!lo->AA_h) {
        cerr = cudaMallocHost((void**) &lo->AA_h, nnzLo*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* row 0 holds nothing but its diagonal */
      lo->AA_h[0] = 1.0;
      src         = 0;
      dst         = 1;
      for (row=1; row<nrows; row++) {
        len  = rowptr[row+1] - rowptr[row];
        ierr = PetscArraycpy(&(lo->AA_h[dst]), aij->a + src, len);CHKERRQ(ierr);
        dst += len;
        lo->AA_h[dst] = 1.0; /* diagonal entry closes the row */
        dst += 1;
        src += len;
      }
      lo->csrMat->values->assign(lo->AA_h, lo->AA_h+nnzLo);
      ierr = PetscLogCpuToGpu(nnzLo*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* ---- first time through: build structure, values and analysis data ---- */
      PetscScalar *hostVals;
      PetscInt    *hostRows,*hostCols;
      CsrMatrix   *csr;

      /* pinned host staging buffers for the lower triangular matrix */
      cerr = cudaMallocHost((void**) &hostVals, nnzLo*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostRows, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostCols, nnzLo*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* row 0 holds nothing but its diagonal */
      hostRows[0]     = (PetscInt) 0;
      hostRows[nrows] = nnzLo;
      hostCols[0]     = (PetscInt) 0;
      hostVals[0]     = (MatScalar) 1.0;

      src = 0;
      dst = 1;
      for (row=1; row<nrows; row++) {
        len = rowptr[row+1] - rowptr[row];

        /* the row starts at the current write position; it will hold len entries plus the diagonal */
        hostRows[row] = dst;

        ierr = PetscArraycpy(&(hostCols[dst]), aij->j + src, len);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(hostVals[dst]), aij->a + src, len);CHKERRQ(ierr);
        dst += len;

        /* additional 1 for the term on the diagonal */
        hostCols[dst] = (PetscInt) row;
        hostVals[dst] = (MatScalar) 1.0;
        dst += 1;

        src += len;
      }

      /* container for the triangular factor information */
      ierr = PetscNew(&lo);CHKERRQ(ierr);
      lo->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix description: zero-based, lower fill mode, unit diagonal */
      stat = cusparseCreateMatDescr(&lo->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(lo->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(lo->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(lo->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(lo->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(lo->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* operation used by the solve */
      lo->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR storage filled from the staging buffers */
      lo->csrMat       = csr = new CsrMatrix;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nnzLo;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(hostRows, hostRows+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzLo);
      csr->column_indices->assign(hostCols, hostCols+nnzLo);

      csr->values = new THRUSTARRAY(nnzLo);
      csr->values->assign(hostVals, hostVals+nnzLo);

      /* solve analysis: create the info object, size/allocate the work buffer when required, then analyse */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&lo->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(factors->handle, lo->solveOp,
                                     csr->num_rows, csr->num_entries, lo->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), lo->solveInfo,
                                     &lo->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&lo->solveBuffer,lo->solveBufferSize);CHKERRCUDA(cerr);

      stat = cusparse_analysis(factors->handle, lo->solveOp,
                               csr->num_rows, csr->num_entries, lo->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(),
                               lo->solveInfo,
                               lo->solvePolicy, lo->solveBuffer);CHKERRCUSPARSE(stat);
     #else
      stat = cusparse_analysis(factors->handle, lo->solveOp,
                               csr->num_rows, csr->num_entries, lo->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(),
                               lo->solveInfo);CHKERRCUSPARSE(stat);
     #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the pinned value buffer stays alive in AA_h for the update path */
      factors->loTriFactorPtr = lo;
      lo->AA_h                = hostVals;
      cerr = cudaFreeHost(hostRows);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(hostCols);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnzLo)*sizeof(int)+nnzLo*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
5749ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds (or refreshes) the upper triangular ILU factor
  kept in ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr.

  Nothing is done when A has no rows, or when A->offloadmask is neither PETSC_OFFLOAD_UNALLOCATED
  nor PETSC_OFFLOAD_CPU.

  The rows are read through the a->diag array of the SeqAIJ data, walking from the last row to the
  first. Each output row starts with its diagonal entry, stored as the reciprocal of the value found
  at the end of the row segment, followed by the remaining entries copied verbatim.

  First call (no upper factor stored yet): structure, values, CUSPARSE descriptor and solve-analysis
  data are all created; the pinned value array is retained in AA_h.
  Subsequent calls: only the values are rebuilt in AA_h and assigned to csrMat->values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up      = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  Mat_SeqAIJ                        *aij     = (Mat_SeqAIJ*)A->data;
  const PetscInt                    nrows    = A->rmap->n;
  const PetscInt                    *diagptr = aij->diag;
  const PetscInt                    *srcCols;
  const MatScalar                   *srcVals;
  PetscInt                          row,len,nnzUp,dst;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!nrows) PetscFunctionReturn(0);
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the upper triangular matrix */
    nnzUp = diagptr[0]-diagptr[nrows];

    if (up) {
      /* ---- the structure already exists: refresh the values only ---- */
      if (!up->AA_h) {
        cerr = cudaMallocHost((void**) &up->AA_h, nnzUp*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }

      /* fill back to front; dst always points at the start of the row being written */
      dst = nnzUp;
      for (row=nrows-1; row>=0; --row) {
        srcVals = aij->a + diagptr[row+1] + 1;

        /* number of elements NOT on the diagonal */
        len = diagptr[row] - diagptr[row+1]-1;

        dst -= (len+1);

        /* diagonal first (reciprocal of the entry that follows the off-diagonals), then the rest */
        up->AA_h[dst] = 1./srcVals[len];
        ierr = PetscArraycpy(&(up->AA_h[dst+1]), srcVals, len);CHKERRQ(ierr);
      }
      up->csrMat->values->assign(up->AA_h, up->AA_h+nnzUp);
      ierr = PetscLogCpuToGpu(nnzUp*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* ---- first time through: build structure, values and analysis data ---- */
      PetscScalar *hostVals;
      PetscInt    *hostRows,*hostCols;
      CsrMatrix   *csr;

      /* pinned host staging buffers for the upper triangular matrix */
      cerr = cudaMallocHost((void**) &hostVals, nnzUp*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostRows, (nrows+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &hostCols, nnzUp*sizeof(PetscInt));CHKERRCUDA(cerr);

      hostRows[0]     = (PetscInt) 0;
      hostRows[nrows] = nnzUp;

      /* fill back to front; dst always points at the start of the row being written */
      dst = nnzUp;
      for (row=nrows-1; row>=0; --row) {
        srcVals = aij->a + diagptr[row+1] + 1;
        srcCols = aij->j + diagptr[row+1] + 1;

        /* number of elements NOT on the diagonal */
        len = diagptr[row] - diagptr[row+1]-1;

        /* step back by one full row (off-diagonals plus the diagonal) */
        dst -= (len+1);

        /* diagonal first; the row offset is the write position itself, i.e. next row's offset minus (len+1) */
        hostCols[dst] = (PetscInt) row;
        hostVals[dst] = (MatScalar)1./srcVals[len];
        hostRows[row] = dst;

        ierr = PetscArraycpy(&(hostCols[dst+1]), srcCols, len);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(hostVals[dst+1]), srcVals, len);CHKERRQ(ierr);
      }

      /* container for the triangular factor information */
      ierr = PetscNew(&up);CHKERRQ(ierr);
      up->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix description: zero-based, upper fill mode, non-unit diagonal */
      stat = cusparseCreateMatDescr(&up->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(up->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(up->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(up->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* operation used by the solve */
      up->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* CSR storage filled from the staging buffers */
      up->csrMat       = csr = new CsrMatrix;
      csr->num_rows    = nrows;
      csr->num_cols    = nrows;
      csr->num_entries = nnzUp;

      csr->row_offsets = new THRUSTINTARRAY32(nrows+1);
      csr->row_offsets->assign(hostRows, hostRows+nrows+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzUp);
      csr->column_indices->assign(hostCols, hostCols+nnzUp);

      csr->values = new THRUSTARRAY(nnzUp);
      csr->values->assign(hostVals, hostVals+nnzUp);

      /* solve analysis: create the info object, size/allocate the work buffer when required, then analyse */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&up->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(factors->handle, up->solveOp,
                                     csr->num_rows, csr->num_entries, up->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), up->solveInfo,
                                     &up->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&up->solveBuffer,up->solveBufferSize);CHKERRCUDA(cerr);

      stat = cusparse_analysis(factors->handle, up->solveOp,
                               csr->num_rows, csr->num_entries, up->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(),
                               up->solveInfo,
                               up->solvePolicy, up->solveBuffer);CHKERRCUSPARSE(stat);
     #else
      stat = cusparse_analysis(factors->handle, up->solveOp,
                               csr->num_rows, csr->num_entries, up->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(),
                               up->solveInfo);CHKERRCUSPARSE(stat);
     #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the pinned value buffer stays alive in AA_h for the update path */
      factors->upTriFactorPtr = up;
      up->AA_h                = hostVals;
      cerr = cudaFreeHost(hostRows);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(hostCols);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((nrows+1+nnzUp)*sizeof(int)+nnzUp*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
7209ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - drives the construction of both ILU triangular factors
  and of the auxiliary data stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Steps, in order:
    1. error out if A->spptr is NULL,
    2. build the lower and the upper triangular factors,
    3. create the work vector (length = local row count) if it does not exist yet and record a->nz,
    4. mark A as PETSC_OFFLOAD_BOTH,
    5. for each of the row (a->row) and column (a->icol) index sets: when it is not the identity
       and no permutation array has been stored yet, copy its indices into rpermIndices /
       cpermIndices and log the transfer.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  IS                           rowperm  = aij->row;
  IS                           colperm  = aij->icol;
  const PetscInt               nrows    = A->rmap->n;
  const PetscInt               *idx;
  PetscBool                    isidentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  PetscCheckFalse(!factors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* the two triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* the work vector is created once and reused afterwards */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(nrows);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: stored only for a non-identity ordering, and only the first time */
  ierr = ISIdentity(rowperm,&isidentity);CHKERRQ(ierr);
  if (!(isidentity || factors->rpermIndices)) {
    ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
    factors->rpermIndices = new THRUSTINTARRAY(nrows);
    factors->rpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(colperm,&isidentity);CHKERRQ(ierr);
  if (!(isidentity || factors->cpermIndices)) {
    ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
    factors->cpermIndices = new THRUSTINTARRAY(nrows);
    factors->cpermIndices->assign(idx, idx+nrows);
    ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7649ae82921SPaul Mullowney 
765087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
766087f3262SPaul Mullowney {
767087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
768087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
769aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
770aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
771087f3262SPaul Mullowney   cusparseStatus_t                  stat;
772087f3262SPaul Mullowney   PetscErrorCode                    ierr;
77357d48284SJunchao Zhang   cudaError_t                       cerr;
774087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
775087f3262SPaul Mullowney   PetscScalar                       *AAUp;
776087f3262SPaul Mullowney   PetscScalar                       *AALo;
777087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
778087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
779087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
780087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
781087f3262SPaul Mullowney 
782087f3262SPaul Mullowney   PetscFunctionBegin;
783cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
784c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
785087f3262SPaul Mullowney     try {
786da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
787da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
788da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
789087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
79057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
79157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
792087f3262SPaul Mullowney 
793087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
794087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
795087f3262SPaul Mullowney         AiUp[n]=nzUpper;
796087f3262SPaul Mullowney         offset = 0;
797087f3262SPaul Mullowney         for (i=0; i<n; i++) {
798087f3262SPaul Mullowney           /* set the pointers */
799087f3262SPaul Mullowney           v  = aa + ai[i];
800087f3262SPaul Mullowney           vj = aj + ai[i];
801087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
802087f3262SPaul Mullowney 
803087f3262SPaul Mullowney           /* first, set the diagonal elements */
804087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
80509f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
806087f3262SPaul Mullowney           AiUp[i]      = offset;
80709f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
808087f3262SPaul Mullowney 
809087f3262SPaul Mullowney           offset+=1;
810087f3262SPaul Mullowney           if (nz>0) {
811f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
812580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
813087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
814087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
815087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
816087f3262SPaul Mullowney             }
817087f3262SPaul Mullowney             offset+=nz;
818087f3262SPaul Mullowney           }
819087f3262SPaul Mullowney         }
820087f3262SPaul Mullowney 
821aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
822da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
823da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
824087f3262SPaul Mullowney 
825aa372e3fSPaul Mullowney         /* Create the matrix description */
82657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
82757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8281b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
829afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
830afb2bd1cSJunchao Zhang        #else
83157d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
832afb2bd1cSJunchao Zhang        #endif
83357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
835087f3262SPaul Mullowney 
836aa372e3fSPaul Mullowney         /* set the matrix */
837aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
838aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
839aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
840aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
841aa372e3fSPaul Mullowney 
842aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
843aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
844aa372e3fSPaul Mullowney 
845aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
846aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
847aa372e3fSPaul Mullowney 
848aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
849aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
850aa372e3fSPaul Mullowney 
851afb2bd1cSJunchao Zhang         /* set the operation */
852afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
853afb2bd1cSJunchao Zhang 
854afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
855da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
856afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8571b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
858afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
859afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
860afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
861afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
862afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
863afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
864afb2bd1cSJunchao Zhang       #endif
865afb2bd1cSJunchao Zhang 
866aa372e3fSPaul Mullowney         /* perform the solve analysis */
867aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
868aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
869aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
870d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8711b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
872d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
873d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
874d49cd2b7SBarry Smith                                 #else
875d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
876afb2bd1cSJunchao Zhang                                 #endif
877da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
878da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
879aa372e3fSPaul Mullowney 
880da79fbbcSStefano Zampini         /* assign the pointer */
881aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
882aa372e3fSPaul Mullowney 
883aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
884da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
885da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
886aa372e3fSPaul Mullowney 
887aa372e3fSPaul Mullowney         /* Create the matrix description */
88857d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
88957d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8901b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
891afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
892afb2bd1cSJunchao Zhang        #else
89357d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
894afb2bd1cSJunchao Zhang        #endif
89557d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
89657d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
897aa372e3fSPaul Mullowney 
898aa372e3fSPaul Mullowney         /* set the operation */
899aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
900aa372e3fSPaul Mullowney 
901aa372e3fSPaul Mullowney         /* set the matrix */
902aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
903aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
904aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
905aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
906aa372e3fSPaul Mullowney 
907aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
908aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
909aa372e3fSPaul Mullowney 
910aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
911aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
912aa372e3fSPaul Mullowney 
913aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
914aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
915aa372e3fSPaul Mullowney 
916afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
917da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
918afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
9191b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
920afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
921afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
922afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
923afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
924afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
925afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
926afb2bd1cSJunchao Zhang       #endif
927afb2bd1cSJunchao Zhang 
928aa372e3fSPaul Mullowney         /* perform the solve analysis */
929aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
930aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
931aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
932d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
9331b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
934d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
935d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
936d49cd2b7SBarry Smith                                 #else
937d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
938afb2bd1cSJunchao Zhang                                 #endif
939da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
940da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
941aa372e3fSPaul Mullowney 
942da79fbbcSStefano Zampini         /* assign the pointer */
943aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
944087f3262SPaul Mullowney 
945da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
94657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
94757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
948da79fbbcSStefano Zampini       } else {
949da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
950da79fbbcSStefano Zampini         offset = 0;
951da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
952da79fbbcSStefano Zampini           /* set the pointers */
953da79fbbcSStefano Zampini           v  = aa + ai[i];
954da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
955da79fbbcSStefano Zampini 
956da79fbbcSStefano Zampini           /* first, set the diagonal elements */
957da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
958da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
959da79fbbcSStefano Zampini 
960da79fbbcSStefano Zampini           offset+=1;
961da79fbbcSStefano Zampini           if (nz>0) {
962da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
963da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
964da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
965da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
966da79fbbcSStefano Zampini             }
967da79fbbcSStefano Zampini             offset+=nz;
968da79fbbcSStefano Zampini           }
969da79fbbcSStefano Zampini         }
9702c71b3e2SJacob Faibussowitsch         PetscCheckFalse(!upTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
9712c71b3e2SJacob Faibussowitsch         PetscCheckFalse(!loTriFactor,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
972da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
973da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
974da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
975da79fbbcSStefano Zampini       }
97657d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
97757d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
978087f3262SPaul Mullowney     } catch(char *ex) {
97998921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
980087f3262SPaul Mullowney     }
981087f3262SPaul Mullowney   }
982087f3262SPaul Mullowney   PetscFunctionReturn(0);
983087f3262SPaul Mullowney }
984087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Puts the ICC triangular factors of a factored matrix on
  the GPU and, when the ordering is not the identity, uploads the permutation and its inverse.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  The construction (or refresh) of the factors themselves is delegated to
  MatSeqAIJCUSPARSEBuildICCTriMatrices().

  This routine may run once per numeric factorization. The work vector and the permutation arrays
  are therefore allocated only when they do not exist yet; on later calls the existing device
  arrays are simply refilled, so nothing is leaked on refactorization.

  Errors with PETSC_ERR_COR when A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseTriFactors,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* the a->nz-n non-diagonal entries are counted twice, the n diagonal entries once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices are only needed for a non-identity ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* reuse the device arrays left by a previous call instead of overwriting (and leaking) them */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1022087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SeqAIJCUSPARSE.

  Input Parameters:
+ A    - the matrix to factor (its values are first copied back from the GPU)
- info - factorization options, forwarded unchanged to MatCholeskyFactorNumeric_SeqAIJ()

  Output Parameter:
. B - the factor matrix; its solve callbacks are set according to the ordering and its
      triangular factors are handed to MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU()
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact   = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* the numeric factorization is carried out by the SeqAIJ routine on host data */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* select the MatSolve variants that match the ordering */
  ierr = ISIdentity(rowperm,&natural);CHKERRQ(ierr);
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10529ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor - Builds the explicit transpose (CSC form) of
  one triangular factor and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix; only used as the object of the logging events
. cusparseTriFactors - container supplying the cuSPARSE handle
- factor             - the triangular factor to transpose; with CUDA >= 11 its csr2cscBuffer and
                       csr2cscBufferSize members are (re)set here

  Output Parameter:
. factorT - the newly allocated transposed factor, analyzed for a non-transpose solve
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *factor,Mat_SeqAIJCUSPARSETriFactorStruct **factorT)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *T;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transposed factor */
  ierr = PetscNew(&T);CHKERRQ(ierr);
  T->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* take over the descriptor attributes of the source factor; the fill mode is flipped */
  matrixType = cusparseGetMatType(factor->descr);
  indexBase  = cusparseGetMatIndexBase(factor->descr);
  fillMode   = cusparseGetMatFillMode(factor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(factor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&T->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(T->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(T->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(T->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(T->descr, diagType);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve itself is a non-transpose one */
  T->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the factor */
  T->csrMat = new CsrMatrix;
  T->csrMat->num_rows       = factor->csrMat->num_cols;
  T->csrMat->num_cols       = factor->csrMat->num_rows;
  T->csrMat->num_entries    = factor->csrMat->num_entries;
  T->csrMat->row_offsets    = new THRUSTINTARRAY32(T->csrMat->num_rows+1);
  T->csrMat->column_indices = new THRUSTINTARRAY32(T->csrMat->num_entries);
  T->csrMat->values         = new THRUSTARRAY(T->csrMat->num_entries);

  /* compute the transpose of the factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, factor->csrMat->num_rows,
                                       factor->csrMat->num_cols, factor->csrMat->num_entries,
                                       factor->csrMat->values->data().get(),
                                       factor->csrMat->row_offsets->data().get(),
                                       factor->csrMat->column_indices->data().get(),
                                       T->csrMat->values->data().get(),
                                       T->csrMat->row_offsets->data().get(), T->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &factor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&factor->csr2cscBuffer,factor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, factor->csrMat->num_rows,
                          factor->csrMat->num_cols, factor->csrMat->num_entries,
                          factor->csrMat->values->data().get(),
                          factor->csrMat->row_offsets->data().get(),
                          factor->csrMat->column_indices->data().get(),
                          T->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          T->csrMat->row_offsets->data().get(), T->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, factor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          /* note: the two output index arrays are passed in the opposite order on this path */
                          T->csrMat->column_indices->data().get(), T->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above (it must be ended, not begun a second time) */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&T->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, T->solveOp,
                                 T->csrMat->num_rows, T->csrMat->num_entries, T->descr,
                                 T->csrMat->values->data().get(), T->csrMat->row_offsets->data().get(),
                                 T->csrMat->column_indices->data().get(), T->solveInfo,
                                 &T->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&T->solveBuffer,T->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, T->solveOp,
                           T->csrMat->num_rows, T->csrMat->num_entries, T->descr,
                           T->csrMat->values->data().get(), T->csrMat->row_offsets->data().get(),
                           T->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           T->solveInfo,
                           T->solvePolicy, T->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           T->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *factorT = T;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Creates the explicit transposes of the lower and upper
  triangular factors stored in A->spptr and analyzes them for the triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose loTriFactorPtr
      and upTriFactorPtr are already set (they are dereferenced without a check)

  Notes:
  On success loTriFactorPtrTranspose and upTriFactorPtrTranspose of the container point to the new
  factors. The lower factor is processed, and its pointer stored, before the upper one is started.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve_OneFactor(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1258bda325fcSPaul Mullowney 
/* Unary functor usable on host and device: converts a PetscScalar to a PetscInt
   by taking its real part and casting the result to PetscInt. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar value)
  {
    return static_cast<PetscInt>(PetscRealPart(value));
  }
};
1267a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) the explicit transpose of A on the GPU
  and stores it in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; its GPU copy is brought up to date first via MatSeqAIJCUSPARSECopyToGPU()

  Notes:
  Returns immediately when A->transupdated is already set.

  For MAT_CUSPARSE_CSR the transpose structure is created once; the permutation mapping the entries of A to
  the entries of its transpose is computed once with csr2csc and cached in csr2csc_i, so that later calls
  only need to permute the values.

  For MAT_CUSPARSE_ELL/MAT_CUSPARSE_HYB (CUDA < 11 only) any existing transpose is invalidated and the
  transpose is rebuilt through HYB -> CSR -> CSC -> HYB conversions.

  On success A->transupdated is set to PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  PetscCheckFalse(A->transupdated && !matstructT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR formats cannot be updated in place: drop the old transpose and rebuild it below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of the transpose: rows and columns are swapped with respect to A */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* uncompressed row offsets of A on the GPU, used as input of csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns. csr2csc writes A->cmap->n+1 column
         pointers, which become the row offsets of the transpose, so tempT must be sized with the
         transposed dimensions (this matters for rectangular matrices) */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB, passing the dimensions of the transpose */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      delete tempT->values;
      delete tempT->column_indices;
      delete tempT->row_offsets;
      delete tempT;
      delete temp->values;
      delete temp->column_indices;
      delete temp->row_offsets;
      delete temp;
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    PetscCheckFalse(!matrix,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    PetscCheckFalse(!matrix->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    PetscCheckFalse(!matrix->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    PetscCheckFalse(!matrix->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    PetscCheckFalse(!matrixT,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    PetscCheckFalse(!matrixT->row_offsets,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    PetscCheckFalse(!matrixT->column_indices,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    PetscCheckFalse(!matrixT->values,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Transpose the sequence 0,1,2,... in place of the actual values: after csr2csc, matrixT->values[k]
         holds the position in A of the k-th entry of the transpose, i.e. the permutation cached in csr2csc_i */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* convert the permuted sequence (stored as scalars) into integer indices */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather the current values of A into the transpose using the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1497bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve on the GPU using the transposed triangular factors
  stored in A->spptr, applying the row permutation to the right-hand side before the two triangular
  solves and the column permutation to the result afterwards.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscInt                        len = xx->map->n;
  const PetscScalar                     *b;
  PetscScalar                           *x;
  thrust::device_ptr<const PetscScalar> bdev;
  thrust::device_ptr<PetscScalar>       xdev;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *work = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* The transposed factors are built lazily: run the analysis the first time a transpose solve is requested */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the solution and of the right-hand side, plus thrust handles for the permutations */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);
  xdev = thrust::device_pointer_cast(x);
  bdev = thrust::device_pointer_cast(b);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: x <- b permuted by the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(bdev, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bdev+len, factors->rpermIndices->end()),
               xdev);

  /* Step 2: solve with the transposed U factor, x -> work */
  {
    CsrMatrix *upCsr = upT->csrMat;

    stat = cusparse_solve(factors->handle, upT->solveOp,
                          upCsr->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upCsr->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upT->descr,
                          upCsr->values->data().get(),
                          upCsr->row_offsets->data().get(),
                          upCsr->column_indices->data().get(),
                          upT->solveInfo,
                          x,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          work->data().get(),
                          upT->solvePolicy, upT->solveBuffer);CHKERRCUSPARSE(stat);
                        #else
                          work->data().get());CHKERRCUSPARSE(stat);
                        #endif
  }

  /* Step 3: solve with the transposed L factor, work -> x */
  {
    CsrMatrix *loCsr = loT->csrMat;

    stat = cusparse_solve(factors->handle, loT->solveOp,
                          loCsr->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          loCsr->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, loT->descr,
                          loCsr->values->data().get(),
                          loCsr->row_offsets->data().get(),
                          loCsr->column_indices->data().get(),
                          loT->solveInfo,
                          work->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          x,
                          loT->solvePolicy, loT->solveBuffer);CHKERRCUSPARSE(stat);
                        #else
                          x);CHKERRCUSPARSE(stat);
                        #endif
  }

  /* Step 4: apply the column permutation; it cannot be done in place, so go through the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(xdev, factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xdev+len, factors->cpermIndices->end()),
               work->begin());

  /* Step 5: move the permuted result from the work vector back into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),work->begin(), work->end(), xdev);

  /* Give the arrays back and close the timing/flop logging */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1586bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve on the GPU using the transposed
  triangular factors stored in A->spptr; no permutation is applied to the right-hand side or the solution.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *b;
  PetscScalar                       *x;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work = (THRUSTARRAY*)factors->workVector;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* The transposed factors are built lazily: run the analysis the first time a transpose solve is requested */
  if (!loT && !upT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the solution and of the right-hand side */
  ierr = VecCUDAGetArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: solve with the transposed U factor, b -> work */
  {
    CsrMatrix *upCsr = upT->csrMat;

    stat = cusparse_solve(factors->handle, upT->solveOp,
                          upCsr->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upCsr->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upT->descr,
                          upCsr->values->data().get(),
                          upCsr->row_offsets->data().get(),
                          upCsr->column_indices->data().get(),
                          upT->solveInfo,
                          b,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          work->data().get(),
                          upT->solvePolicy, upT->solveBuffer);CHKERRCUSPARSE(stat);
                        #else
                          work->data().get());CHKERRCUSPARSE(stat);
                        #endif
  }

  /* Step 2: solve with the transposed L factor, work -> x */
  {
    CsrMatrix *loCsr = loT->csrMat;

    stat = cusparse_solve(factors->handle, loT->solveOp,
                          loCsr->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          loCsr->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, loT->descr,
                          loCsr->values->data().get(),
                          loCsr->row_offsets->data().get(),
                          loCsr->column_indices->data().get(),
                          loT->solveInfo,
                          work->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          x,
                          loT->solvePolicy, loT->solveBuffer);CHKERRCUSPARSE(stat);
                        #else
                          x);CHKERRCUSPARSE(stat);
                        #endif
  }

  /* Give the arrays back and close the timing/flop logging */
  ierr = VecCUDARestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1656bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Applies the permuted lower/upper triangular solves on the GPU.

  Input Parameters:
+ A  - matrix whose A->spptr holds the Mat_SeqAIJCUSPARSETriFactors (handle, factors,
       permutation indices and work vector)
- bb - right-hand side vector, read on the device

  Output Parameter:
. xx - result vector, written on the device

  Notes:
  The sequence is
    1. gather bb through rpermIndices into the work vector,
    2. solve with the lower factor: work vector -> xx,
    3. solve with the upper factor: xx -> work vector,
    4. gather the work vector through cpermIndices into xx.
  Both thrust::copy calls are issued on PetscDefaultCudaStream.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  /* Thrust views of the raw device arrays, needed by the permutation iterators */
  thrust::device_ptr<PetscScalar>       solGPU = thrust::device_pointer_cast(sol);
  thrust::device_ptr<const PetscScalar> rhsGPU = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: work = bb gathered through the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsGPU, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: lower triangular solve, work -> xx */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        work->data().get(),
                        sol
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Step 3: upper triangular solve, xx -> work */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        sol,
                        work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Step 4: xx = work gathered through the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solGPU);

  /* Hand the arrays back, then close the timer and record the flop count */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17329ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Applies the lower then upper triangular solve on
  the GPU without any row/column permutation.

  Input Parameters:
+ A  - matrix whose A->spptr holds the Mat_SeqAIJCUSPARSETriFactors
- bb - right-hand side vector, read on the device

  Output Parameter:
. xx - result vector, written on the device

  Notes:
  The lower solve reads bb and writes the work vector; the upper solve reads the work vector
  and writes xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Device arrays of the output and input vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Lower triangular solve: bb -> work */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        rhs,
                        work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Upper triangular solve: work -> xx */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(),
                        sol
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
                        );CHKERRCUSPARSE(stat);

  /* Hand the arrays back, then close the timer and record the flop count */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17949ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Refreshes the host copy of the nonzero values from the device.

  Only acts when A->offloadmask is PETSC_OFFLOAD_GPU: the a->nz values of the device CSR
  matrix are copied into a->a and the mask is set to PETSC_OFFLOAD_BOTH. For every other
  mask value the routine returns without touching anything.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (A->offloadmask != PETSC_OFFLOAD_GPU) {
    PetscFunctionReturn(0);
  }
  {
    Mat_SeqAIJ         *aij    = (Mat_SeqAIJ*)A->data;
    Mat_SeqAIJCUSPARSE *gpu    = (Mat_SeqAIJCUSPARSE*)A->spptr;
    CsrMatrix          *csr    = (CsrMatrix*)gpu->mat->mat;
    const size_t       nbytes  = aij->nz*sizeof(PetscScalar); /* size of the values array */

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device now hold the same values */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18157e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array for read/write access.

  The host copy is first brought up to date with MatSeqAIJCUSPARSECopyFromGPU(), then
  *array is pointed at the values array of the Mat_SeqAIJ stored in A->data.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
182567a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - Ends read/write access to the host values array.

  Clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
183367a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - Returns the host values array for read-only access.

  The host copy is first brought up to date with MatSeqAIJCUSPARSECopyFromGPU(), then
  *array is pointed at the values array of the Mat_SeqAIJ stored in A->data.
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
184367a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - Ends read-only access to the host values array.

  Only clears the caller's pointer; A itself (including its offload mask) is left untouched.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
185067a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - Returns the host values array for write-only access.

  No device-to-host copy is made: *array is simply pointed at the values array of the
  Mat_SeqAIJ stored in A->data.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  array[0] = aij->a;
  PetscFunctionReturn(0);
}
185767a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - Ends write-only access to the host values array.

  Clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0]       = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
18657e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Makes the device representation of A current.

  Input Parameter:
. A - the matrix; A->data is the host Mat_SeqAIJ, A->spptr the Mat_SeqAIJCUSPARSE

  Notes:
  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  If the nonzero state recorded on the device side matches A->nonzerostate and the storage
  format is CSR, only the values array is uploaded. Otherwise the existing device structure,
  cached transpose, work vector and row offsets are released and everything is rebuilt from
  the host CSR data (using the compressed-row description when a->compressedrow.use is set).

  When the host has no values array (a->a is NULL) only the sparsity pattern is uploaded and
  the offload mask is left unchanged; in all other cases it becomes PETSC_OFFLOAD_BOTH.

  Errors with PETSC_ERR_GPU if A is bound to the CPU or required host CSR data is missing, and
  with PETSC_ERR_SUP for ELL/HYB storage when built against CUDA 11.0 or later.
*/
PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  PetscCheckFalse(A->boundtocpu,PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      PetscCheckFalse(a->nz && !a->a,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: drop the cached transpose values but keep its structure */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* Release everything that depends on the old nonzero structure */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        PetscCheckFalse(!ii,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
        PetscCheckFalse(m && !a->j,PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        /* Reject unsupported formats before anything is allocated, so the error path does not leak
           the mult struct, its descriptor and its device scalars */
        PetscCheckFalse(cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #endif
        /* Without host values only the pattern is uploaded, so host and device are not yet in sync */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* Device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          /* Already rejected above; kept as a guard in case the early check is ever removed */
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* Temporary device CSR copy used only as the input of the CSR -> HYB conversion */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* The temporary CSR copy is no longer needed (mat and its arrays were all just allocated) */
          delete (THRUSTARRAY*)mat->values;
          delete (THRUSTINTARRAY32*)mat->column_indices;
          delete (THRUSTINTARRAY32*)mat->row_offsets;
          delete (CsrMatrix*)mat;
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* Log what was actually uploaded: m+1 row offsets and nnz column indices (int), tmp compressed-row
           indices (PetscInt), the three scalar constants, and the nnz values only when the host has them */
        ierr = PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero structure the device copy corresponds to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
20229ae82921SPaul Mullowney 
/* Functor for tuple-valued iterators: adds element 0 of the tuple into element 1 (t1 = t1 + t0) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* element 1 is the accumulator, element 0 the addend */
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
2032aa372e3fSPaul Mullowney 
/* Functor for tuple-valued iterators: copies element 0 of the tuple into element 1 (t1 = t0) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* element 0 is the source, element 1 the destination */
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
20427e8381f9SStefano Zampini 
/* Functor for tuple-valued iterators: copies element 1 of the tuple into element 0 (t0 = t1) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    /* element 1 is the source, element 0 the destination */
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
2052e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - Per-product data kept for cuSPARSE matrix-matrix operations.

  The instance is torn down by MatDestroy_MatMatCusparse(), which is what establishes the
  ownership notes on the individual members below.
*/
struct MatMatCusparse {
  PetscBool             cisdense;    /* NOTE(review): presumably records whether the product C was dense - confirm at the use sites */
  PetscScalar           *Bt;         /* buffer released with cudaFree(); NOTE(review): name suggests a transposed copy of B - confirm */
  Mat                   X;           /* auxiliary matrix, released with MatDestroy() */
  PetscBool             reusesym;    /* needed because cuSPARSE has no separate symbolic and numeric phases for sparse matmat products */
  PetscLogDouble        flops;       /* flop count; NOTE(review): presumably logged by the numeric phase - confirm */
  CsrMatrix             *Bcsr;       /* CSR object released with delete */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr; /* sparse descriptor, destroyed with cusparseDestroySpMat() when non-NULL */
  PetscBool             initialized; /* state flag for the product C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;   /* dense descriptor, destroyed with cusparseDestroyDnMat() when non-NULL */
  cusparseDnMatDescr_t  matCDescr;   /* dense descriptor, destroyed with cusparseDestroyDnMat() when non-NULL */
  PetscInt              Blda,Clda;   /* leading dimensions of B and C, stored so that a change can be detected */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;   /* buffer released with cudaFree() when non-NULL */
  void                  *dBuffer5;   /* buffer released with cudaFree() when non-NULL */
 #endif
  size_t                mmBufferSize; /* NOTE(review): presumably the size in bytes of mmBuffer - confirm */
  void                  *mmBuffer;   /* buffer released with cudaFree() when non-NULL */
  void                  *mmBuffer2;  /* SpGEMM work-estimation buffer, released with cudaFree() when non-NULL */
  cusparseSpGEMMDescr_t spgemmDesc;  /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() when non-NULL */
#endif
};
2077ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - destroy callback for the MatMatCusparse product context.

  Input Parameter:
. data - pointer to the MatMatCusparse structure to be released

  Releases the device buffers, the cuSPARSE descriptors and the intermediate matrix owned by
  the context, then frees the context itself.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *ctx = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  /* transposed copy of B (only allocated with CUDA < 11) and the CSR view of B */
  cerr = cudaFree(ctx->Bt);CHKERRCUDA(cerr);
  delete ctx->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* cuSPARSE descriptors: each one is destroyed only if it has been created */
  if (ctx->matSpBDescr) {
    stat = cusparseDestroySpMat(ctx->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->matBDescr) {
    stat = cusparseDestroyDnMat(ctx->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->matCDescr) {
    stat = cusparseDestroyDnMat(ctx->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(ctx->spgemmDesc);CHKERRCUSPARSE(stat);
  }
  /* device work buffers */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (ctx->dBuffer4) {
    cerr = cudaFree(ctx->dBuffer4);CHKERRCUDA(cerr);
  }
  if (ctx->dBuffer5) {
    cerr = cudaFree(ctx->dBuffer5);CHKERRCUDA(cerr);
  }
 #endif
  if (ctx->mmBuffer) {
    cerr = cudaFree(ctx->mmBuffer);CHKERRCUDA(cerr);
  }
  if (ctx->mmBuffer2) {
    cerr = cudaFree(ctx->mmBuffer2);CHKERRCUDA(cerr);
  }
 #endif
  /* intermediate dense matrix, then the context itself */
  ierr = MatDestroy(&ctx->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2106ccdfe979SStefano Zampini 
2107ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2108ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - numeric phase of the products of a
  MATSEQAIJCUSPARSE matrix A with a dense matrix B.

  Input/Output Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse context created by
      MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA()

  Notes:
  Handles MATPRODUCT_AB, MATPRODUCT_AtB, MATPRODUCT_ABt, MATPRODUCT_PtAP and MATPRODUCT_RARt.
  The sparse-dense multiplication is done by cuSPARSE; for PtAP and RARt its result is stored in
  the intermediate matrix mmdata->X and C is then obtained with
  MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private().

  If B is not of type MATSEQDENSECUDA it is converted in place for the duration of the call and
  converted back to MATSEQDENSE before returning. If C was a MATSEQDENSE matrix in the symbolic
  phase it is converted back to MATSEQDENSE at the end.

  With CUDA >= 11 the dense and sparse descriptors are cached; their value pointers are refreshed
  at every call, and the SpMM work buffer is re-sized when a leading dimension changes or when the
  sparse descriptor had to be created.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* select the mult struct and the operation applied to A; m x n is the size of the SpMM result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  PetscCheckFalse(!mat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* the SpMM result goes in the intermediate matrix X for the triple products, directly in C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* the work buffer must be (re)sized if not initialized, if LDAs are different, or if the sparse descriptor is missing */
  PetscBool           setup = (PetscBool)(!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda || !mat->matDescr);

  /* dense descriptor of B: recreate it when the LDA changed, otherwise just refresh the data pointer.
     The refresh is done at every call, so that a descriptor kept across a change of the other LDA never
     holds a stale pointer */
  if (mmdata->matBDescr && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
  if (!mmdata->matBDescr) {
    stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
    mmdata->Blda = blda;
  } else {
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
  }

  /* dense descriptor of the result: matCDescr is for C or mmdata->X */
  if (mmdata->matCDescr && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
  if (!mmdata->matCDescr) {
    stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
    mmdata->Clda = clda;
  } else {
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* sparse descriptor of A: created on demand for the selected mult struct, otherwise refreshed */
  if (!mat->matDescr) {
    stat = cusparseCreateCsr(&mat->matDescr,
                             csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                             csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                             csrmat->values->data().get(),
                             CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                             CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
  } else {
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
  }

  if (setup) {
    size_t mmBufferSize;

    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* the buffer only grows */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      /* reset the bookkeeping first: if the allocation below fails, the destroy routine must not free the old pointer again */
      mmdata->mmBuffer     = NULL;
      mmdata->mmBufferSize = 0;
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* triple products: finish with a dense-dense multiplication of B and the intermediate X into C */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* give back the matrix types the user passed in */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2281ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of the products of a
  MATSEQAIJCUSPARSE matrix A (CSR storage format only) with a dense matrix B.

  Input/Output Parameter:
. C - the product matrix; its product data must be empty on entry

  Notes:
  Sets the sizes of C and its type to MATSEQDENSECUDA, creates the MatMatCusparse context
  (remembering whether C was a MATSEQDENSE matrix) and installs the numeric routine
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA().
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           nrows,ncols;
  PetscBool          cwasdense,isaijcusparse;
  PetscErrorCode     ierr;
  MatMatCusparse     *ctx;
  Mat_SeqAIJCUSPARSE *Acusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusparse);CHKERRQ(ierr);
  PetscCheckFalse(!isaijcusparse,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* sizes of the product matrix */
  switch (product->type) {
  case MATPRODUCT_AB:   nrows = A->rmap->n; ncols = B->cmap->n; break;
  case MATPRODUCT_AtB:  nrows = A->cmap->n; ncols = B->cmap->n; break;
  case MATPRODUCT_ABt:  nrows = A->rmap->n; ncols = B->rmap->n; break;
  case MATPRODUCT_PtAP: nrows = B->cmap->n; ncols = B->cmap->n; break;
  case MATPRODUCT_RARt: nrows = B->rmap->n; ncols = B->rmap->n; break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,nrows,ncols,nrows,ncols);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cwasdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&ctx);CHKERRQ(ierr);
  ctx->cisdense = cwasdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&ctx->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* X has as many rows as A; its columns are the rows of B for RARt and the columns of B for PtAP */
    const PetscInt xcols = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    ierr = MatCreate(PetscObjectComm((PetscObject)C),&ctx->X);CHKERRQ(ierr);
    ierr = MatSetType(ctx->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    ierr = MatSetSizes(ctx->X,A->rmap->n,xcols,A->rmap->n,xcols);CHKERRQ(ierr);
  }
  C->product->data    = ctx;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2355ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse products
  (MATPRODUCT_AB, MATPRODUCT_AtB, MATPRODUCT_ABt) of two MATSEQAIJCUSPARSE matrices.

  Input/Output Parameter:
. C - the product matrix, of type MATSEQAIJCUSPARSE, with a MatMatCusparse context in C->product->data

  Notes:
  The computation is skipped when the values have already been computed by the symbolic phase
  (mmdata->reusesym) or when C has no nonzeros. In all cases the routine ends with a reduced
  version of the assembly bookkeeping.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscBool                    compute = PETSC_TRUE; /* becomes false when the multiplication can be skipped */
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  PetscCheckFalse(!C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A = product->A;
  B = product->B;

  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    compute = PETSC_FALSE;
  } else if (!c->nz) { /* empty product, nothing to compute */
    compute = PETSC_FALSE;
  }

  if (compute) {
    /* sanity checks on the operands */
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
    PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
    PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
    PetscCheckFalse(A->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    PetscCheckFalse(B->boundtocpu,PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    PetscCheckFalse(Ccusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

    /* a transposed product with a symmetric operand is performed as a plain AB */
    ptype = product->type;
    if (A->symmetric && ptype == MATPRODUCT_AtB) {
      ptype = MATPRODUCT_AB;
      PetscCheckFalse(!product->symbolic_used_the_fact_A_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
    }
    if (B->symmetric && ptype == MATPRODUCT_ABt) {
      ptype = MATPRODUCT_AB;
      PetscCheckFalse(!product->symbolic_used_the_fact_B_is_symmetric,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
    }

    /* pick the mult structs: transposed operands use the explicit transpose */
    switch (ptype) {
    case MATPRODUCT_AB:
      Amat = Acusp->mat;
      Bmat = Bcusp->mat;
      break;
    case MATPRODUCT_AtB:
      Amat = Acusp->matTranspose;
      Bmat = Bcusp->mat;
      break;
    case MATPRODUCT_ABt:
      Amat = Acusp->mat;
      Bmat = Bcusp->matTranspose;
      break;
    default:
      SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
    }
    Cmat = Ccusp->mat;
    PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
    PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
    PetscCheckFalse(!Cmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
    Acsr = (CsrMatrix*)Amat->mat;
    if (mmdata->Bcsr) Bcsr = mmdata->Bcsr; /* B may be in compressed row storage */
    else              Bcsr = (CsrMatrix*)Bmat->mat;
    Ccsr = (CsrMatrix*)Cmat->mat;
    PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
    PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
    PetscCheckFalse(!Ccsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (mmdata->Bcsr) BmatSpDescr = mmdata->matSpBDescr; /* B may be in compressed row storage */
    else              BmatSpDescr = Bmat->matDescr;
    stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                  Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                  cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                  mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
    stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                               Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                               Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                               Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2485fcdce8c4SStefano Zampini 
2486fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2487fcdce8c4SStefano Zampini {
2488fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2489fcdce8c4SStefano Zampini   Mat                          A,B;
2490fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2491fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2492fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2493fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2494fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2495fcdce8c4SStefano Zampini   PetscBool                    flg;
2496fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2497fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2498fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2499fcdce8c4SStefano Zampini   MatProductType               ptype;
2500fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2501fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2502fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2503fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2504fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2505fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2506fcdce8c4SStefano Zampini #else
2507fcdce8c4SStefano Zampini   int                          cnz;
2508fcdce8c4SStefano Zampini #endif
2509b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2510fcdce8c4SStefano Zampini 
2511fcdce8c4SStefano Zampini   PetscFunctionBegin;
2512fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
25132c71b3e2SJacob Faibussowitsch   PetscCheckFalse(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2514fcdce8c4SStefano Zampini   A    = product->A;
2515fcdce8c4SStefano Zampini   B    = product->B;
2516fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
25172c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2518fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
25192c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!flg,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2520fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2521fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2522fcdce8c4SStefano Zampini   /* product data */
2523fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2524fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2525fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2526fcdce8c4SStefano Zampini 
2527fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2528fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2529d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2530d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
25312c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Acusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
25322c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Bcusp->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2533d60bce21SJunchao Zhang 
2534fcdce8c4SStefano Zampini   ptype = product->type;
2535fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2536fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2537fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2538fa046f9fSJunchao Zhang   }
2539fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2540fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2541fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2542fa046f9fSJunchao Zhang   }
2543fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2544fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2545fcdce8c4SStefano Zampini   switch (ptype) {
2546fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2547fcdce8c4SStefano Zampini     m = A->rmap->n;
2548fcdce8c4SStefano Zampini     n = B->cmap->n;
2549fcdce8c4SStefano Zampini     k = A->cmap->n;
2550fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2551fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2552fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2553fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2554fcdce8c4SStefano Zampini     break;
2555fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2556fcdce8c4SStefano Zampini     m = A->cmap->n;
2557fcdce8c4SStefano Zampini     n = B->cmap->n;
2558fcdce8c4SStefano Zampini     k = A->rmap->n;
25593606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2560fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2561fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2562fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2563fcdce8c4SStefano Zampini     break;
2564fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2565fcdce8c4SStefano Zampini     m = A->rmap->n;
2566fcdce8c4SStefano Zampini     n = B->rmap->n;
2567fcdce8c4SStefano Zampini     k = A->cmap->n;
25683606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
2569fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2570fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2571fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2572fcdce8c4SStefano Zampini     break;
2573fcdce8c4SStefano Zampini   default:
257498921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2575fcdce8c4SStefano Zampini   }
2576fcdce8c4SStefano Zampini 
2577fcdce8c4SStefano Zampini   /* create cusparse matrix */
2578fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2579fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2580fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2581fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2582fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2583fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2584fcdce8c4SStefano Zampini 
2585fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2586fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2587fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2588fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2589fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2590fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2591fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2592fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2593fcdce8c4SStefano Zampini   } else {
2594fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2595fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2596fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2597fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2598fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2599fcdce8c4SStefano Zampini   }
2600fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2601fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2602fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2603fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2604fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2605fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2606fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2607fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2608fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2609fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2610fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2611fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2612fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2613fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2614fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2615fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2616fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2617fcdce8c4SStefano Zampini     c->nz = 0;
2618fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2619fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2620fcdce8c4SStefano Zampini     goto finalizesym;
2621fcdce8c4SStefano Zampini   }
2622fcdce8c4SStefano Zampini 
26232c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!Amat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
26242c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!Bmat,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2625fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2626fcdce8c4SStefano Zampini   if (!biscompressed) {
2627fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2628fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2629fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2630fcdce8c4SStefano Zampini #endif
2631fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2632fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2633fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2634fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2635fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2636fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2637fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2638fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2639fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2640fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2641fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2642fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2643fcdce8c4SStefano Zampini     }
2644fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2645fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2646fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2647fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2648fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2649fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2650fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2651fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2652fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2653fcdce8c4SStefano Zampini     }
2654fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2655fcdce8c4SStefano Zampini #endif
2656fcdce8c4SStefano Zampini   }
26572c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!Acsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
26582c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!Bcsr,PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2659fcdce8c4SStefano Zampini   /* precompute flops count */
2660fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2661fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2662fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2663fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2664fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2665fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2666fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2667fcdce8c4SStefano Zampini       }
2668fcdce8c4SStefano Zampini     }
2669fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2670fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2671fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2672fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2673fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2674fcdce8c4SStefano Zampini     }
2675fcdce8c4SStefano Zampini   } else { /* TODO */
2676fcdce8c4SStefano Zampini     flops = 0.;
2677fcdce8c4SStefano Zampini   }
2678fcdce8c4SStefano Zampini 
2679fcdce8c4SStefano Zampini   mmdata->flops = flops;
2680fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2681b4285af6SJunchao Zhang 
2682fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2683fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2684fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2685fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2686fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2687fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2688fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2689b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2690b4285af6SJunchao Zhang  {
2691b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2692b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2693b4285af6SJunchao Zhang   */
2694b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2695b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2696b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2697b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2698b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2699b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2700b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2701b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2702b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2703b4285af6SJunchao Zhang 
2704b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2705b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2706b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2707b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2708b4285af6SJunchao Zhang                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2709b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2710b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2711b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2712b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2713b4285af6SJunchao Zhang                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2714b4285af6SJunchao Zhang 
2715b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2716b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2717b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2718b4285af6SJunchao Zhang                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2719b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2720b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2721b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2722b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2723b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2724b4285af6SJunchao Zhang                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2725b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2726b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2727b4285af6SJunchao Zhang 
2728b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2729b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
2730b4285af6SJunchao Zhang   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2731b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2732b4285af6SJunchao Zhang   /* allocate matrix C */
2733b4285af6SJunchao Zhang   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2734b4285af6SJunchao Zhang   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2735b4285af6SJunchao Zhang   /* update matC with the new pointers */
2736b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2737b4285af6SJunchao Zhang                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2738b4285af6SJunchao Zhang 
2739b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2740b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2741b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2742b4285af6SJunchao Zhang                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2743b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2744b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2745b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2746b4285af6SJunchao Zhang                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2747b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2748b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2749b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2750b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2751b4285af6SJunchao Zhang                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
27527d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
2753b4285af6SJunchao Zhang  }
2754ae37ee31SJunchao Zhang  #else
2755b4285af6SJunchao Zhang   size_t bufSize2;
2756fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2757b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2758fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2759fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2760fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2761bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2762fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2763b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2764fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2765fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2766fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2767fcdce8c4SStefano Zampini   /* ask again for the number of bytes of external memory needed, this time for the compute step */
2768b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2769fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2770fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2771fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2772fcdce8c4SStefano Zampini   /* Neither the CUSPARSE documentation nor the API is clear here.
2773fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2774fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API;
2775fcdce8c4SStefano Zampini      it only appears in the workEstimation calls, but it seems to be needed in compute, so its address
2776fcdce8c4SStefano Zampini      is probably stored in the descriptor! What a messy API... */
2777bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2778fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2779b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2780fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2781fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2782fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2783fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2784fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2785fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
27867d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2787fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2788fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2789fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2790fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2791fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2792fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2793b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2794fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2795fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2796ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2797fcdce8c4SStefano Zampini #else
2798fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2799b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2800fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2801fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2802fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2803fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2804fcdce8c4SStefano Zampini   c->nz = cnz;
2805fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2806fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2807fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2808fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2809fcdce8c4SStefano Zampini 
2810fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2811fcdce8c4SStefano Zampini   /* With the old gemm interface (removed from 11.0 on) we cannot perform only the symbolic phase of the product.
2812fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows a symbolic-only computation by passing NULL for the values, but it seems quite buggy when
2813fcdce8c4SStefano Zampini      D is NULL, despite the fact that the CUSPARSE documentation claims it is supported! */
2814b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2815fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2816fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2817fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2818fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2819fcdce8c4SStefano Zampini #endif
2820fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2821fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2822fcdce8c4SStefano Zampini finalizesym:
2823fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2824fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2825fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2826fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2827fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2828fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2829fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2830fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2831fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2832fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2833fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2834fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2835fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2836fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2837fcdce8c4SStefano Zampini   } else {
2838fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2839fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2840fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2841fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2842fcdce8c4SStefano Zampini   }
2843fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2844fcdce8c4SStefano Zampini     PetscInt r = 0;
2845fcdce8c4SStefano Zampini     c->i[0] = 0;
2846fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2847fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2848fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2849fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2850fcdce8c4SStefano Zampini     }
2851fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2852fcdce8c4SStefano Zampini   }
2853fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2854fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2855fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2856fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2857fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2858fcdce8c4SStefano Zampini   c->rmax = 0;
2859fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2860fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2861fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2862fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2863fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2864fcdce8c4SStefano Zampini   }
2865fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2866fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2867fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2868fcdce8c4SStefano Zampini 
2869fcdce8c4SStefano Zampini   C->nonzerostate++;
2870fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2871fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2872fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2873fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2874fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2875fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2876fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2877abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2878fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2879fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2880fcdce8c4SStefano Zampini   }
2881fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2882fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2883fcdce8c4SStefano Zampini }
2884fcdce8c4SStefano Zampini 
2885fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2886fcdce8c4SStefano Zampini 
/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - Picks the symbolic-phase routine for the product described by
  mat->product. The second operand B may be dense or sparse.

  Dispatch order:
    1. B has base type MATSEQDENSE: use the sparse-times-dense CUDA routine (or defer to the SeqAIJ/SeqDense
       setup when A is bound to the CPU); ABC goes through MatProductSymbolic_ABC_Basic.
    2. B (and C, for ABC) are MATSEQAIJCUSPARSE and none of A, B, C is bound to the CPU: AB, AtB and ABt use
       the CUSPARSE sparse-sparse routine; PtAP, RARt and ABC go through MatProductSymbolic_ABC_Basic.
       In this situation a command line option lets the user request the CPU code instead.
    3. Anything else: defer to MatProductSetFromOptions_SeqAIJ().
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscBool      Bdense = PETSC_FALSE,Bgpu = PETSC_FALSE,Cgpu = PETSC_TRUE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B only counts as a GPU matrix when neither A nor B has been bound to the CPU */
  if (!(product->A->boundtocpu || product->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bgpu);CHKERRQ(ierr);
  }
  /* C is inspected only for ABC; for every other product type Cgpu keeps its PETSC_TRUE default */
  if (product->type == MATPRODUCT_ABC) {
    if (product->C->boundtocpu) Cgpu = PETSC_FALSE;
    else {ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Cgpu);CHKERRQ(ierr);}
  }

  if (Bgpu && Cgpu) { /* the GPU path is available, but the user may always ask for the CPU backend */
    PetscBool  usecpu = PETSC_FALSE;
    const char *title = NULL,*optname = NULL,*manpage = NULL;

    /* the option name and the options title depend on the product type and on which API created the product */
    switch (product->type) {
    case MATPRODUCT_AB:
      title   = product->api_user ? "MatMatMult" : "MatProduct_AB";
      optname = product->api_user ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatMatMult";
      break;
    case MATPRODUCT_AtB:
      title   = product->api_user ? "MatTransposeMatMult" : "MatProduct_AtB";
      optname = product->api_user ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatTransposeMatMult";
      break;
    case MATPRODUCT_PtAP:
      title   = product->api_user ? "MatPtAP" : "MatProduct_PtAP";
      optname = product->api_user ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatPtAP";
      break;
    case MATPRODUCT_RARt:
      title   = product->api_user ? "MatRARt" : "MatProduct_RARt";
      optname = product->api_user ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatRARt";
      break;
    case MATPRODUCT_ABC:
      title   = product->api_user ? "MatMatMatMult" : "MatProduct_ABC";
      optname = product->api_user ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      manpage = "MatMatMatMult";
      break;
    default: /* no CPU-backend option is queried for the remaining product types */
      break;
    }
    if (optname) {
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");CHKERRQ(ierr);
      ierr = PetscOptionsBool(optname,"Use CPU code",manpage,usecpu,&usecpu,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    if (usecpu) Bgpu = Cgpu = PETSC_FALSE; /* forces the SeqAIJ fallback below (unless B is dense) */
  }

  /* dispatch */
  {
    const MatProductType ptype = product->type;

    if (Bdense) {
      if (ptype == MATPRODUCT_ABC) {
        mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      } else if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
        if (product->A->boundtocpu) {
          ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
        } else {
          mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
        }
      } /* other product types: nothing is set here */
    } else if (Bgpu && Cgpu) {
      if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      } else if (ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt || ptype == MATPRODUCT_ABC) {
        mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      } /* other product types: nothing is set here */
    } else { /* fallback for AIJ */
      ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
3009ccdfe979SStefano Zampini 
/* yy = A xx: the shared multiply-add kernel is called with no vector to add and op(A) = A */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3018e6e9a74fSStefano Zampini 
/* zz = A xx + yy: the shared multiply-add kernel is called with op(A) = A */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool transpose = PETSC_FALSE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3027e6e9a74fSStefano Zampini 
/* yy = A^H xx: the shared multiply-add kernel is called with no vector to add and both transpose flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3036e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: the shared multiply-add kernel is called with both transpose flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30459ae82921SPaul Mullowney 
/* yy = A^T xx: the shared multiply-add kernel is called with no vector to add, transposed but not conjugated */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3054ca45077fSPaul Mullowney 
/*
  ScatterAdd - y[idx[i]] += x[i] for every i in [0,n)

  Launch layout: one thread per entry, using only the x dimension of the grid and of the block. The launch
  must provide at least n threads in total; surplus threads do nothing.

  The update is a plain (non-atomic) read-modify-write, so the result is only well defined when idx holds
  no repeated entries.

  The flat thread index is formed in 64-bit arithmetic. blockIdx.x*blockDim.x is otherwise a 32-bit unsigned
  product that wraps for very long vectors (possible when PetscInt is 64 bit); stored in an int it could even
  become negative, pass the i < n guard and index out of bounds.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const long long i = (long long)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < (long long)n) y[idx[i]] += x[i];
}
3060a0e72f99SJunchao Zhang 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - Computes zz = op(A) xx + yy on the GPU

  Input Parameters:
+ A     - the matrix
. xx    - the vector to multiply
. yy    - the vector to add, or NULL to compute zz = op(A) xx; may be the same Vec as zz
. trans - PETSC_TRUE to apply the transpose of A
- herm  - PETSC_TRUE (only together with trans) to apply the conjugate transpose

  Output Parameter:
. zz - the result

  Notes:
  op(A) = A if !trans; A^T if trans && !herm; A^H if trans && herm. herm without trans raises an error.

  If A has no nonzero rows, the routine returns early with zz = 0 (yy NULL) or zz = yy.

  For a plain transpose with A->form_explicit_transpose set, the explicitly stored transpose is multiplied
  with the non-transpose operation; otherwise cuSPARSE applies the (conjugate) transpose to the stored matrix.

  When the matrix structure in use has compressed rows (cprowIndices is set), a work vector holds the short
  side of the product: the result is scattered into zz in the non-transpose case, and the entries of xx are
  gathered into it in the transpose case.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx = 0,ny = 0; /* only given real values (and only used) for MAT_CUSPARSE_CSR */
#endif

  PetscFunctionBegin;
  PetscCheckFalse(herm && !trans,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* op(A) xx is zero, so no SpMV is needed */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    PetscCheckFalse(!matstruct,PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose to the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         The SpMV accumulates into zarray, which holds y only when zz and yy are the same vector. In every
         other case zarray was obtained write-only (its content is not initialized), so beta must be zero;
         a distinct yy is added after the SpMV.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into matstruct->cuSpMV[], so its integer value must be 0, 1 or 2 */
      PetscCheckFalse(opA < 0 || opA > 2,PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
        {
          /* a kernel launch returns no status; a bad launch configuration is only visible through the last-error state */
          cudaError_t cerr = cudaGetLastError();CHKERRCUDA(cerr);
        }
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
32529ae82921SPaul Mullowney 
/* zz = A^T xx + yy: the shared multiply-add kernel is called transposed but not conjugated */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool transpose = PETSC_TRUE,hermitian = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,transpose,hermitian);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3261ca45077fSPaul Mullowney 
/*
  MatAssemblyEnd_SeqAIJCUSPARSE - Completes assembly through MatAssemblyEnd_SeqAIJ(). If that call changed
  A->nonzerostate and a device matrix (cusp->deviceMat) is cached, the cached copy is freed and the pointer
  reset to NULL.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState stateBefore = A->nonzerostate; /* sampled before the host assembly runs */
  PetscErrorCode         ierr;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* nothing to release: the nonzero state did not change, or no device matrix is cached */
  if (stateBefore == A->nonzerostate || !cusp->deviceMat) PetscFunctionReturn(0);

  ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
  cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
  cusp->deviceMat = NULL;
  PetscFunctionReturn(0);
}
32799ae82921SPaul Mullowney 
32809ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3281e057df02SPaul Mullowney /*@
32829ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3283e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately be pushed down
3284e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3285e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3286e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3287e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
32889ae82921SPaul Mullowney 
3289d083f849SBarry Smith    Collective
32909ae82921SPaul Mullowney 
32919ae82921SPaul Mullowney    Input Parameters:
32929ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
32939ae82921SPaul Mullowney .  m - number of rows
32949ae82921SPaul Mullowney .  n - number of columns
32959ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
32969ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
32970298fd71SBarry Smith          (possibly different for each row) or NULL
32989ae82921SPaul Mullowney 
32999ae82921SPaul Mullowney    Output Parameter:
33009ae82921SPaul Mullowney .  A - the matrix
33019ae82921SPaul Mullowney 
33029ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
33039ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
33049ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
33059ae82921SPaul Mullowney 
33069ae82921SPaul Mullowney    Notes:
33079ae82921SPaul Mullowney    If nnz is given then nz is ignored
33089ae82921SPaul Mullowney 
33099ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
33109ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
33119ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
33129ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
33139ae82921SPaul Mullowney 
33149ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
33150298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
33169ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
33179ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
33189ae82921SPaul Mullowney 
33199ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
33209ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
33219ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
33229ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
33239ae82921SPaul Mullowney 
33249ae82921SPaul Mullowney    Level: intermediate
33259ae82921SPaul Mullowney 
3326e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
33279ae82921SPaul Mullowney @*/
33289ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
33299ae82921SPaul Mullowney {
33309ae82921SPaul Mullowney   PetscErrorCode ierr;
33319ae82921SPaul Mullowney 
33329ae82921SPaul Mullowney   PetscFunctionBegin;
33339ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
33349ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
33359ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
33369ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
33379ae82921SPaul Mullowney   PetscFunctionReturn(0);
33389ae82921SPaul Mullowney }
33399ae82921SPaul Mullowney 
33406fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
33419ae82921SPaul Mullowney {
33429ae82921SPaul Mullowney   PetscErrorCode ierr;
3343ab25e6cbSDominic Meiser 
33449ae82921SPaul Mullowney   PetscFunctionBegin;
33459ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
3346470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
33479ae82921SPaul Mullowney   } else {
3348470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3349aa372e3fSPaul Mullowney   }
3350c215019aSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3351ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3352365b711fSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
3353ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3354ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3355fcdce8c4SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3356ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
33577e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
33587e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3359ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
33609ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
33619ae82921SPaul Mullowney   PetscFunctionReturn(0);
33629ae82921SPaul Mullowney }
33639ae82921SPaul Mullowney 
3364ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
336595639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
33669ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
33679ff858a8SKarl Rupp {
33689ff858a8SKarl Rupp   PetscErrorCode ierr;
33699ff858a8SKarl Rupp 
33709ff858a8SKarl Rupp   PetscFunctionBegin;
33719ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3372ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
33739ff858a8SKarl Rupp   PetscFunctionReturn(0);
33749ff858a8SKarl Rupp }
33759ff858a8SKarl Rupp 
3376039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
337795639643SRichard Tran Mills {
3378e6e9a74fSStefano Zampini   PetscErrorCode     ierr;
3379a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3380039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3381039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3382039c6fbaSStefano Zampini   PetscScalar        *ay;
3383039c6fbaSStefano Zampini   const PetscScalar  *ax;
3384039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
3385e6e9a74fSStefano Zampini 
338695639643SRichard Tran Mills   PetscFunctionBegin;
3387a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3388a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3389039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
3390a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3391a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3392a587d139SMark     PetscFunctionReturn(0);
339395639643SRichard Tran Mills   }
3394039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
3395a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3396a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
33972c71b3e2SJacob Faibussowitsch   PetscCheckFalse(cy->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
33982c71b3e2SJacob Faibussowitsch   PetscCheckFalse(cx->format != MAT_CUSPARSE_CSR,PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3399039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
3400039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
3401039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3402039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3403039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3404039c6fbaSStefano Zampini     if (eq) {
3405039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3406039c6fbaSStefano Zampini     }
3407039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3408039c6fbaSStefano Zampini   }
3409d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3410d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3411039c6fbaSStefano Zampini 
3412039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3413039c6fbaSStefano Zampini     cusparseStatus_t stat;
3414039c6fbaSStefano Zampini     PetscScalar      b = 1.0;
3415039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3416039c6fbaSStefano Zampini     size_t           bufferSize;
3417039c6fbaSStefano Zampini     void             *buffer;
3418ee7b52eaSHong Zhang     cudaError_t      cerr;
3419039c6fbaSStefano Zampini #endif
3420039c6fbaSStefano Zampini 
3421039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3422039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3423039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3424039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3425039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3426039c6fbaSStefano Zampini                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3427039c6fbaSStefano Zampini                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3428039c6fbaSStefano Zampini                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3429039c6fbaSStefano Zampini     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3430039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3431039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3432039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3433039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3434039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3435039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3436039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3437039c6fbaSStefano Zampini     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3438039c6fbaSStefano Zampini #else
3439039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3440039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3441039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3442039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3443039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3444039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3445039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3446039c6fbaSStefano Zampini #endif
3447039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3448039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3449039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3450039c6fbaSStefano Zampini     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3451039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3452a587d139SMark     cublasHandle_t cublasv2handle;
3453039c6fbaSStefano Zampini     cublasStatus_t berr;
3454a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3455039c6fbaSStefano Zampini 
3456039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3457039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3458a587d139SMark     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3459a587d139SMark     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3460a587d139SMark     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3461039c6fbaSStefano Zampini     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3462a587d139SMark     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3463a587d139SMark     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3464039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3465039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3466a587d139SMark     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3467039c6fbaSStefano Zampini   } else {
3468a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3469d2be01edSStefano Zampini     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3470a587d139SMark   }
347195639643SRichard Tran Mills   PetscFunctionReturn(0);
347295639643SRichard Tran Mills }
347395639643SRichard Tran Mills 
347433c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
347533c9ba73SStefano Zampini {
347633c9ba73SStefano Zampini   PetscErrorCode ierr;
347733c9ba73SStefano Zampini   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
347833c9ba73SStefano Zampini   PetscScalar    *ay;
347933c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
348033c9ba73SStefano Zampini   cublasStatus_t berr;
348133c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
348233c9ba73SStefano Zampini 
348333c9ba73SStefano Zampini   PetscFunctionBegin;
348433c9ba73SStefano Zampini   ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
348533c9ba73SStefano Zampini   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
348633c9ba73SStefano Zampini   ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
348733c9ba73SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
348833c9ba73SStefano Zampini   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
348933c9ba73SStefano Zampini   ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
349033c9ba73SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
349133c9ba73SStefano Zampini   ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
349233c9ba73SStefano Zampini   ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
349333c9ba73SStefano Zampini   PetscFunctionReturn(0);
349433c9ba73SStefano Zampini }
349533c9ba73SStefano Zampini 
34963fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
34973fa6b06aSMark Adams {
34983fa6b06aSMark Adams   PetscErrorCode ierr;
34997e8381f9SStefano Zampini   PetscBool      both = PETSC_FALSE;
3500a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
35017e8381f9SStefano Zampini 
35023fa6b06aSMark Adams   PetscFunctionBegin;
35033fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
35043fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
35057e8381f9SStefano Zampini     if (spptr->mat) {
35067e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
35077e8381f9SStefano Zampini       if (matrix->values) {
35087e8381f9SStefano Zampini         both = PETSC_TRUE;
35097e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
35107e8381f9SStefano Zampini       }
35117e8381f9SStefano Zampini     }
35127e8381f9SStefano Zampini     if (spptr->matTranspose) {
35137e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
35147e8381f9SStefano Zampini       if (matrix->values) {
35157e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
35167e8381f9SStefano Zampini       }
35177e8381f9SStefano Zampini     }
35183fa6b06aSMark Adams   }
3519a587d139SMark   //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
3520a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3521a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
35227e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3523a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
35243fa6b06aSMark Adams   PetscFunctionReturn(0);
35253fa6b06aSMark Adams }
35263fa6b06aSMark Adams 
3527a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3528a587d139SMark {
3529a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3530a587d139SMark   PetscErrorCode ierr;
3531a587d139SMark 
3532a587d139SMark   PetscFunctionBegin;
35339a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
35349a14fc28SStefano Zampini     A->boundtocpu = flg;
35359a14fc28SStefano Zampini     PetscFunctionReturn(0);
35369a14fc28SStefano Zampini   }
3537a587d139SMark   if (flg) {
3538a587d139SMark     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3539a587d139SMark 
354033c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3541a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3542a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3543a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3544a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3545a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3546a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3547a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3548a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3549fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
355067a45760SJunchao Zhang     ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
3551c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3552a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3553a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3554a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3555a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3556a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3557fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3558a587d139SMark   } else {
355933c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3560a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3561a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3562a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3563a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3564a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3565a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3566a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3567a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3568fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
356967a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
357067a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
357167a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
357267a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
357367a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
357467a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3575c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3576a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3577a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3578a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3579a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3580fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3581a587d139SMark   }
3582a587d139SMark   A->boundtocpu = flg;
3583ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
3584ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
3585ea500dcfSRichard Tran Mills   } else {
3586ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
3587ea500dcfSRichard Tran Mills   }
3588a587d139SMark   PetscFunctionReturn(0);
3589a587d139SMark }
3590a587d139SMark 
359149735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
35929ae82921SPaul Mullowney {
35939ae82921SPaul Mullowney   PetscErrorCode   ierr;
3594aa372e3fSPaul Mullowney   cusparseStatus_t stat;
359549735bf3SStefano Zampini   Mat              B;
35969ae82921SPaul Mullowney 
35979ae82921SPaul Mullowney   PetscFunctionBegin;
3598a4af0ceeSJacob Faibussowitsch   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
359949735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
360049735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
360149735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
360249735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
360349735bf3SStefano Zampini   }
360449735bf3SStefano Zampini   B = *newmat;
360549735bf3SStefano Zampini 
360634136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
360734136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
360834136279SStefano Zampini 
360949735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
36109ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3611e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
3612e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3613e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3614a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
36151a2c6b5cSJunchao Zhang       spptr->format     = MAT_CUSPARSE_CSR;
3616d8132acaSStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3617a435da06SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3618a435da06SStefano Zampini       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3619a435da06SStefano Zampini      #else
3620d8132acaSStefano Zampini       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3621a435da06SStefano Zampini      #endif
3622d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3623d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3624d8132acaSStefano Zampini      #endif
36251a2c6b5cSJunchao Zhang       B->spptr = spptr;
36269ae82921SPaul Mullowney     } else {
3627e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3628e6e9a74fSStefano Zampini 
3629e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3630e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3631a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3632e6e9a74fSStefano Zampini       B->spptr = spptr;
36339ae82921SPaul Mullowney     }
3634e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
363549735bf3SStefano Zampini   }
3636693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
36379ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
36381a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
36399ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
364095639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3641693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
36422205254eSKarl Rupp 
3643e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
36449ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3645bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
3646ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
3647ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
3648ae48a8d0SStefano Zampini #endif
3649365b711fSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
36509ae82921SPaul Mullowney   PetscFunctionReturn(0);
36519ae82921SPaul Mullowney }
36529ae82921SPaul Mullowney 
365302fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
365402fe1965SBarry Smith {
365502fe1965SBarry Smith   PetscErrorCode ierr;
365602fe1965SBarry Smith 
365702fe1965SBarry Smith   PetscFunctionBegin;
365802fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
36590ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
366002fe1965SBarry Smith   PetscFunctionReturn(0);
366102fe1965SBarry Smith }
366202fe1965SBarry Smith 
36633ca39a21SBarry Smith /*MC
3664e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3665e057df02SPaul Mullowney 
3666e057df02SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
36672692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
36682692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3669e057df02SPaul Mullowney 
3670e057df02SPaul Mullowney    Options Database Keys:
3671e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3672aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3673a2b725a8SWilliam Gropp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3674365b711fSMark Adams -  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3675e057df02SPaul Mullowney 
3676e057df02SPaul Mullowney   Level: beginner
3677e057df02SPaul Mullowney 
36788468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3679e057df02SPaul Mullowney M*/
36807f756511SDominic Meiser 
3681bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
36820f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - registers the factorization routines provided by this file.

  MATSOLVERCUSPARSEBAND is registered for MATSEQAIJ with LU only; MATSOLVERCUSPARSE is registered
  for MATSEQAIJCUSPARSE with LU, Cholesky, ILU and ICC, all served by the same factory routine.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const size_t        nftypes  = sizeof(ftypes)/sizeof(ftypes[0]);
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  /* same registration order as the factor types are listed above */
  for (size_t k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
369629b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - releases everything held by a Mat_SeqAIJCUSPARSE and frees the struct.

  Input/Output Parameter:
. cusparsestruct - address of the struct pointer; nothing is done when it points to NULL

  Notes:
  Destroys the matrix and transpose mult structs, the thrust work arrays, the cuSPARSE handle
  (when one exists) and, when use_extended_coo is set, the device jmap/perm arrays.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  PetscErrorCode     ierr;
  cusparseStatus_t   stat;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* thrust containers; delete on a NULL pointer is a no-op */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  if (cusp->use_extended_coo) {
    cerr = cudaFree(cusp->jmap_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusp->perm_d);CHKERRCUDA(cerr);
  }
  /* PetscFree() is applied to the caller's pointer, not to the local alias */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37217f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - deletes the three arrays of a CsrMatrix, then the CsrMatrix itself.

  Input/Output Parameter:
. mat - address of the CsrMatrix pointer; set to NULL after destruction, untouched if already NULL
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = NULL;
  }
  PetscFunctionReturn(0);
}
37347f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - overload that destroys one triangular factor struct.

  Input/Output Parameter:
. trifactor - address of the struct pointer; nothing is done when it points to NULL

  Notes:
  Releases the matrix descriptor, the solve analysis info, the CSR matrix, the device solve
  buffer, the AA_h buffer (released with cudaFreeHost) and, for CUDA >= 11, the csr2csc buffer,
  then frees the struct itself.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* PetscFree() is applied to the caller's pointer, not to the local alias */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37547f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - overload that destroys one mat-mult struct.

  Input Parameters:
+ matstruct - address of the struct pointer; set to NULL on return, untouched if already NULL
- format    - storage format that tells how (*matstruct)->mat must be interpreted

  Notes:
  For MAT_CUSPARSE_ELL / MAT_CUSPARSE_HYB the stored matrix is a cusparseHybMat_t; with
  CUDA >= 11 those formats raise PETSC_ERR_SUP. For any other format the stored matrix is a
  CsrMatrix. The matrix descriptor, the compressed-row indices, the device scalars
  alpha_one/beta_zero/beta_one and, for CUDA >= 11, the SpMat descriptor and the three SpMV
  work sets are released as well.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of dropping the return code, as the tri-factor overload does */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* only the SpMV slots that were actually initialized own a buffer and vector descriptors */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37987f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the contents of a tri-factors struct but keeps the struct.

  Input Parameter:
. trifactors - address of the tri-factors pointer; nothing is done when it points to NULL

  Notes:
  Destroys the four triangular factor structs, the permutation index arrays, the work vector
  and the banded-solver device arrays, and clears init_dev_prop. Every released pointer is set
  to NULL so that calling this routine again (MatSeqAIJCUSPARSETriFactors_Destroy() does so)
  does not free the same memory twice.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* avoid a double free on the next reset/destroy */
    }
    if ((*trifactors)->i_band_d) {
      cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* avoid a double free on the next reset/destroy */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3821ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets a tri-factors struct, destroys its cuSPARSE handle and frees it.

  Input/Output Parameter:
. trifactors - address of the struct pointer; nothing is done when it points to NULL
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cusparseHandle_t handle;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
38387e8381f9SStefano Zampini 
/* Strict ordering of (i,j) pairs: by first component, ties broken by second component */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = t1.get<0>(), i2 = t2.get<0>();

    if (i1 != i2) return i1 < i2;
    return t1.get<1>() < t2.get<1>();
  }
};
38497e8381f9SStefano Zampini 
/* Equality of (i,j) pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
38597e8381f9SStefano Zampini 
/* Change indicator: 1 when the two indices differ, 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
38687e8381f9SStefano Zampini 
/* Logical OR of two flags returned as a PetscInt: 1 when either argument is nonzero, else 0 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
38777e8381f9SStefano Zampini 
38787e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/* Associated with MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic() */
/*
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic - fills the device CSR values of A from an array of COO values.

  Input Parameters:
+ A     - the matrix; A->spptr must hold a Mat_SeqAIJCUSPARSE with a CSR mult struct
. v     - the COO values, in host or CUDA memory (decided with isCudaMem()), or NULL
- imode - ADD_VALUES adds to the stored values; any other mode overwrites them

  Notes:
  Without a cooPerm array the matrix is only assembled. With v == NULL the stored values are
  zeroed for INSERT_VALUES and left alone otherwise. When cooPerm_a exists the COO input has
  repeated (i,j) entries, which are summed with reduce_by_key before being stored.
  On return the matrix is marked assembled with its data valid on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *aij = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *v_staged = NULL; /* device copy of v[], only when v[] lives on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *csr;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  PetscCheckFalse(!cusp,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  PetscCheckFalse(!csr->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (v) {
    n = cusp->cooPerm->size();
    if (!isCudaMem(v)) { /* host input: stage it on the device first */
      v_staged = new THRUSTARRAY(n);
      v_staged->assign(v,v+n);
      d_v  = v_staged->data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      d_v = thrust::device_pointer_cast(v);
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (cusp->cooPerm_a) {
      /* Repeated entries in d_v[]: cooPerm_a is ordered and has one key per COO entry, e.g. [0,0,1,2,3,4];
         d_v[cooPerm[i]] contributes to the cooPerm_a[i]-th unique nonzero.
         thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output) sums each run of equal keys. */
      auto permuted = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());

      if (imode == ADD_VALUES) { /* reduce into a scratch array, then add it to the existing values */
        THRUSTARRAY *reduced = new THRUSTARRAY(csr->values->size());

        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),permuted,thrust::make_discard_iterator(),reduced->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(reduced->begin(),reduced->end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
        delete reduced;
      } else { /* reduce straight into the stored values */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),permuted,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else {
      /* All entries in d_v[] are unique: walk (d_v[cooPerm[i]], values[i]) pairs */
      auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                csr->values->begin()));
      auto last  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                csr->values->end()));

      if (imode == ADD_VALUES) {
        thrust::for_each(first,last,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
      } else {
        thrust::for_each(first,last,VecCUDAEquals());
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  } else if (imode == INSERT_VALUES) {
    /* no values supplied: inserting means zeroing the stored values */
    thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  }

  delete v_staged;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,aij->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",aij->rmax);CHKERRQ(ierr);
  aij->reallocs       = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled        = PETSC_TRUE;
  A->was_assembled    = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
39617e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the cached transpose of A as out of date.

  Input Parameters:
+ A       - the matrix, which must be of type MATSEQAIJCUSPARSE
- destroy - when true, also destroy the transpose mult struct and the csr2csc_i array

  Notes:
  Returns without touching A->transupdated when A has no CUSPARSE struct.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3978a49f1ed0SStefano Zampini 
39797e8381f9SStefano Zampini #include <thrust/binary_search.h>
/* 'Basic' means it only works when coo_i[] and coo_j[] do not contain negative indices */
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic - builds the CSR nonzero pattern of A from COO indices.

  Input Parameters:
+ A     - the matrix; A->spptr must hold a Mat_SeqAIJCUSPARSE
. n     - number of COO entries
. coo_i - row indices of the entries
- coo_j - column indices of the entries

  Notes:
  The (i,j) pairs are sorted and made unique on the device. cooPerm records the sorting
  permutation; cooPerm_a, kept only when some pair is repeated, maps every sorted entry to the
  index of its unique nonzero. The resulting pattern is copied to the host Mat_SeqAIJ, the
  values are zeroed and pushed to the GPU, and the matrix is left unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscInt           prev_n = 0, nzrows = 0;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);

  /* the permutation arrays can only be reused when the number of COO entries is unchanged */
  if (cusp->cooPerm) prev_n = cusp->cooPerm->size();
  if (n != prev_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm   = NULL;
    cusp->cooPerm_a = NULL;
  }

  if (!n) {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  } else {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY rowend(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);

    /* Running example:
         n = 6
         coo_i = [3,3,1,4,1,4]
         coo_j = [3,2,2,5,2,6]
    */
    auto key_first = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto key_last  = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(key_first, key_last, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by column */
    *cusp->cooPerm_a = d_i;        /* copy of the sorted rows */
    THRUSTINTARRAY jchange = d_j;  /* copy of the sorted columns */

    /* After sorting:
         d_i     = [1,1,3,3,4,4]
         d_j     = [2,2,2,3,5,6]
         cooPerm = [2,4,1,0,3,5]
    */
    auto key_uend = thrust::unique(key_first, key_last, IJEqual()); /* unique (d_i, d_j) pairs */

    /* After unique, key_uend points one past the last unique pair:
         d_i     = [1,3,3,4,4,x]
         d_j     = [2,2,3,5,6,x]
    */
    if (key_uend != key_last) { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the sorted (i,j) sequence starts a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0] */
      adjacent_difference(jchange.begin(),jchange.end(),jchange.begin(),IJDiff());                            /* jchange:   [2,2,2,3,5,6] => [2,0,0,1,1,1] */
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      jchange[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),jchange.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a = [0,0,1,1,1,1] */
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /* cooPerm_a = [0,0,1,2,3,4] */
    } else { /* all entries are unique: no reduction map is needed */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    }

    /* Binary search the row numbers [0,rmap->n) in the ordered unique rows d_i = [1,3,3,4,4] (rmap->n = 6 in the example).
       rowend[r] receives the last position in d_i[] where r could be inserted without violating the ordering,
       i.e. rowend = [0,1,1,3,5,5]. The leading 0 of the CSR row offsets is added below. */
    thrust::counting_iterator<PetscInt> row_first(0);
    thrust::upper_bound(d_i.begin(), key_uend.get_iterator_tuple().get<0>(),
                        row_first, row_first + A->rmap->n,
                        rowend.begin());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* replace the host CSR arrays by freshly allocated, separately owned ones */
    ierr = MatSeqXAIJFreeAIJ(A,&aij->a,&aij->j,&aij->i);CHKERRQ(ierr);
    aij->singlemalloc = PETSC_FALSE;
    aij->free_a       = PETSC_TRUE;
    aij->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&aij->i);CHKERRQ(ierr);
    aij->i[0] = 0; /* aij->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(aij->i+1,rowend.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    aij->nz    = aij->i[A->rmap->n];
    aij->maxnz = aij->nz;
    aij->rmax  = 0;
    ierr = PetscMalloc1(aij->nz,&aij->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(aij->nz,&aij->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(aij->j,d_j.data().get(),aij->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!aij->ilen) { ierr = PetscMalloc1(A->rmap->n,&aij->ilen);CHKERRQ(ierr); }
    if (!aij->imax) { ierr = PetscMalloc1(A->rmap->n,&aij->imax);CHKERRQ(ierr); }

    /* per-row lengths, number of nonempty rows and longest row */
    for (PetscInt row = 0; row < A->rmap->n; row++) {
      const PetscInt len = aij->i[row+1] - aij->i[row];

      if (len) nzrows++;
      aij->ilen[row] = len;
      aij->imax[row] = len;
      aij->rmax      = PetscMax(aij->rmax,len);
    }
    aij->nonzerorowcnt = nzrows;
    A->preallocated    = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+aij->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(aij->a,aij->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzrows,&aij->compressedrow,aij->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled     = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4098ed502f03SStefano Zampini 
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation entry point for MATSEQAIJCUSPARSE.

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices of the entries
- coo_j - column indices of the entries

  Notes:
  Host index arrays are scanned for negative indices. If none is found, or if the memory type
  reported for coo_i is not host memory, the 'Basic' device implementation is used. Otherwise
  the preallocation is done by a temporary MATSEQAIJ matrix that is converted and merged into
  mat, and its jmap/perm arrays are copied to the device (use_extended_coo is set).

  When the basic path is taken, any jmap_d/perm_d left over from an earlier extended
  preallocation is released and use_extended_coo is cleared, so the stale maps are not kept
  alongside the new nonzero pattern.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *seq;
  Mat_SeqAIJCUSPARSE *dev;
  Mat                newmat;
  PetscInt           coo_basic = 1;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) {
        if (coo_i[k] < 0 || coo_j[k] < 0) {coo_basic = 0; break;}
      }
    }
  }

  if (coo_basic) { /* i,j are on device or do not contain negative indices */
    dev = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    if (dev && dev->use_extended_coo) { /* drop the maps of a previous extended preallocation */
      cerr = cudaFree(dev->jmap_d);CHKERRCUDA(cerr);
      cerr = cudaFree(dev->perm_d);CHKERRCUDA(cerr);
      dev->jmap_d           = NULL;
      dev->perm_d           = NULL;
      dev->use_extended_coo = PETSC_FALSE;
    }
    ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    /* let the host implementation handle negative indices, then adopt its result */
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATSEQAIJ);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_SeqAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */

    /* mat->data and mat->spptr are read only after the merge */
    seq  = static_cast<Mat_SeqAIJ*>(mat->data);
    dev  = static_cast<Mat_SeqAIJCUSPARSE*>(mat->spptr);
    cerr = cudaMalloc((void**)&dev->jmap_d,(seq->nz+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(dev->jmap_d,seq->jmap,(seq->nz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&dev->perm_d,seq->Atot*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMemcpy(dev->perm_d,seq->perm,seq->Atot*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    dev->use_extended_coo = PETSC_TRUE;
  }
  PetscFunctionReturn(0);
}
4140219fbbafSJunchao Zhang 
/*
  MatAddCOOValues - CUDA kernel that gathers COO input values into the entries of a[]

  Input Parameters:
+ kv    - the input values; read as kv[perm[k]]
. nnz   - number of entries of a[] to compute
. jmap  - offsets into perm[]; entry i of a[] sums kv[perm[k]] for k in [jmap[i],jmap[i+1]), so jmap[] is read up to index nnz
. perm  - indices into kv[]
- imode - with INSERT_VALUES the gathered sum replaces a[i], with any other mode it is added to a[i]

  Output Parameter:
. a - the array receiving the sums

  Notes:
  Each thread starts at its global 1D index and advances by the total number of launched threads,
  so every i in [0,nnz) is handled regardless of how many blocks were launched.
*/
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount jmap[],const PetscCount perm[],InsertMode imode,PetscScalar a[])
{
  /* blockIdx, blockDim and gridDim hold unsigned int members: convert to PetscCount before multiplying so the
     products are not evaluated (and possibly wrapped) in 32-bit arithmetic */
  PetscCount        i = (PetscCount)blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = (PetscCount)gridDim.x*blockDim.x;
  for (; i<nnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES? 0.0 : a[i]) + sum;
  }
}
4151219fbbafSJunchao Zhang 
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the values of a matrix from an array of COO values

  Input Parameters:
+ A     - the matrix
. v     - the COO values; PetscGetMemType() is used to tell whether they live on the host or on the device
- imode - INSERT_VALUES to overwrite the current values, otherwise the new values are added to them

  Notes:
  When dev->use_extended_coo is set, the values are combined on the device by the MatAddCOOValues() kernel
  using dev->jmap_d and dev->perm_d; host input is first copied into a temporary device buffer of
  seq->coo_n scalars that is freed before returning. Otherwise the call is forwarded to
  MatSetValuesCOO_SeqAIJCUSPARSE_Basic().
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  PetscErrorCode      ierr;
  cudaError_t         cerr;
  Mat_SeqAIJ          *seq = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE  *dev = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscCount          Annz = seq->nz;
  PetscMemType        memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;

  PetscFunctionBegin;
  if (dev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
      cerr = cudaMalloc((void**)&v1,seq->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,seq->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    /* write-only access is enough when every entry is overwritten; otherwise the current values are read too */
    if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr);}
    else {ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr);}

    if (Annz) { /* with Annz == 0 the grid would have zero blocks, which is an invalid launch configuration */
      MatAddCOOValues<<<(Annz+255)/256,256>>>(v1,Annz,dev->jmap_d,dev->perm_d,imode,Aa);
      cerr = cudaPeekAtLastError();CHKERRCUDA(cerr); /* kernel launches do not return an error code themselves */
    }

    if (imode == INSERT_VALUES) {ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);}
    else {ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);}

    /* release the temporary device copy made for host input */
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(A,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
4185219fbbafSJunchao Zhang 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (pass NULL if not needed)
-   j - the CSR column indices (pass NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      If both i and j are NULL the routine returns right after validating A. Only the CSR storage format is supported.

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing to do only when neither array is requested; each output is optional on its own */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!cusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      /* the full row offsets are built from the host a->i once and cached in cusp->rowoffsets_gpu */
      if (!cusp->rowoffsets_gpu) {
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
42345f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - gives back the device i and j index arrays obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE, as passed to MatSeqAIJCUSPARSEGetIJ(); not used by this routine

    Output Parameters:
+   i - the CSR row pointers; set to NULL on return (may be NULL)
-   j - the CSR column indices; set to NULL on return (may be NULL)

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* nothing is released here: only the caller's pointers are cleared */
  if (j != NULL) {
    *j = NULL;
  }
  if (i != NULL) {
    *i = NULL;
  }
  PetscFunctionReturn(0);
}
42615f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - obtains a read-only pointer to the device array storing the values of a MATSEQAIJCUSPARSE matrix

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  CsrMatrix          *storage = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");

  /* bring the device copy up to date before handing out its values */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!gpu->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

  storage = (CsrMatrix*)gpu->mat->mat;
  PetscCheckFalse(!storage->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = storage->values->data().get();
  PetscFunctionReturn(0);
}
4297ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - gives back the read-only array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access changes nothing in the matrix: just clear the caller's pointer */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4322ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - obtains a read-write pointer to the device array storing the values of a MATSEQAIJCUSPARSE matrix

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   The offload mask of A is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose() is called on A.
   The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  CsrMatrix          *storage = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");

  /* the caller may read the current values, so the device copy has to be up to date */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  PetscCheckFalse(!gpu->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

  storage = (CsrMatrix*)gpu->mat->mat;
  PetscCheckFalse(!storage->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  /* mark the device as the holder of the valid data, then hand out the pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a = storage->values->data().get();
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - gives back the read-write array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() and increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* the values may have been modified through the pointer */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);

  a[0] = NULL;
  PetscFunctionReturn(0);
}
4388039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - obtains a pointer for writing to the device array storing the values of a MATSEQAIJCUSPARSE matrix

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

   MatSeqAIJCUSPARSEInvalidateTranspose() is called on A.
   The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *gpu = static_cast<Mat_SeqAIJCUSPARSE*>(A->spptr);
  CsrMatrix          *storage = NULL;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  PetscCheckFalse(gpu->format == MAT_CUSPARSE_ELL || gpu->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* no copy to the device here: the existing device structure must already be present */
  PetscCheckFalse(!gpu->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");

  storage = (CsrMatrix*)gpu->mat->mat;
  PetscCheckFalse(!storage->values,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  /* mark the device as the holder of the valid data, then hand out the pointer */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  *a = storage->values->data().get();
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4425ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - gives back the write-only array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data; set to NULL on return

   Level: developer

   Notes: calls MatSeqAIJInvalidateDiagonal() and increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr = 0;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);

  /* the values have been (re)written through the pointer */
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);

  a[0] = NULL;
  PetscFunctionReturn(0);
}
4454ed502f03SStefano Zampini 
/* Less-than comparison of 4-tuples: lexicographic on elements 0 and 1; elements 2 and 3 are ignored */
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2)
  {
    const int first1 = thrust::get<0>(t1);
    const int first2 = thrust::get<0>(t2);

    /* element 0 decides unless it ties, in which case element 1 does */
    if (first1 != first2) return first1 < first2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4465ed502f03SStefano Zampini 
/* Unary functor adding a fixed integer offset to its argument */
struct Shift
{
  int _shift; /* the offset added by operator() */

  Shift(int shift)
  {
    _shift = shift;
  }

  __host__ __device__
  inline int operator() (const int &c)
  {
    const int shifted = _shift + c;
    return shifted;
  }
};
4477ed502f03SStefano Zampini 
4478ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4479ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4480ed502f03SStefano Zampini {
4481ed502f03SStefano Zampini   PetscErrorCode               ierr;
4482ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4483ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4484ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4485ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4486ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4487ed502f03SStefano Zampini   cusparseStatus_t             stat;
4488ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4489ed502f03SStefano Zampini   cudaError_t                  cerr;
4490ed502f03SStefano Zampini 
4491ed502f03SStefano Zampini   PetscFunctionBegin;
4492ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4493ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4494ed502f03SStefano Zampini   PetscValidPointer(C,4);
4495ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4496ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
44972c71b3e2SJacob Faibussowitsch   PetscCheckFalse(A->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
44982c71b3e2SJacob Faibussowitsch   PetscCheckFalse(reuse == MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
44992c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
45002c71b3e2SJacob Faibussowitsch   PetscCheckFalse(Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4501ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4502ed502f03SStefano Zampini     m     = A->rmap->n;
4503ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
4504ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4505ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4506ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4507ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4508ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4509ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4510ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4511ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4512ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4513ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4514ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4515ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4516ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4517ed502f03SStefano Zampini     Ccusp->nrows    = m;
4518ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4519ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4520ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4521ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4522ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4523ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4524ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4525ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4526ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4527ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4528ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4529ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4530ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4531ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4532ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
45332c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
45342c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4535ed502f03SStefano Zampini 
4536ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4537ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4538ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4539ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4540ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4541ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4542ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4543ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4544ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4545ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4546ed502f03SStefano Zampini     if (c->nz) {
45472ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
45482ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
45492ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
45502ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
45512ed87e7eSStefano Zampini 
4552ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4553ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4554ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4555ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4556ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4557ed502f03SStefano Zampini         }
45582ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
45592ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4560ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4561ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4562ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4563ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4564ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4565ed502f03SStefano Zampini         }
45662ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
45672ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4568ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
45692ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
45702ed87e7eSStefano Zampini                               Aroff->data().get(),
45712ed87e7eSStefano Zampini                               Annz,
45722ed87e7eSStefano Zampini                               m,
45732ed87e7eSStefano Zampini                               Acoo->data().get(),
45742ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4575ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
45762ed87e7eSStefano Zampini                               Broff->data().get(),
4577ed502f03SStefano Zampini                               Bnnz,
4578ed502f03SStefano Zampini                               m,
45792ed87e7eSStefano Zampini                               Bcoo->data().get(),
4580ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
45812ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
45822ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
45832ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
45848909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4585ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4586ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
45878909a122SStefano Zampini #else
45888909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
45898909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
45908909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
45918909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
45928909a122SStefano Zampini #endif
45932ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
45942ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
45952ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
45962ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
45972ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
45982ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4599ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4600ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4601ed502f03SStefano Zampini       thrust::advance(p2,Annz);
46022ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
46038909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
46048909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
46058909a122SStefano Zampini #endif
46062ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
46072ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
46082ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
46092ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
46102ed87e7eSStefano Zampini #else
46112ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
46122ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
46132ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
46142ed87e7eSStefano Zampini #endif
4615ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
46162ed87e7eSStefano Zampini                               Ccoo->data().get(),
4617ed502f03SStefano Zampini                               c->nz,
4618ed502f03SStefano Zampini                               m,
4619ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4620ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4621ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
46222ed87e7eSStefano Zampini       delete wPerm;
46232ed87e7eSStefano Zampini       delete Acoo;
46242ed87e7eSStefano Zampini       delete Bcoo;
46252ed87e7eSStefano Zampini       delete Ccoo;
4626ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4627ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4628ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4629ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4630ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4631ed502f03SStefano Zampini #endif
46321a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
46333606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
46343606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
4635ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4636ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4637ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4638ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4639ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4640ed502f03SStefano Zampini 
46411a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
46421a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4643a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4644ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4645ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4646ed502f03SStefano Zampini         CcsrT->num_rows = n;
4647ed502f03SStefano Zampini         CcsrT->num_cols = m;
4648ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4649ed502f03SStefano Zampini 
4650ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4651ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4652ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4653ed502f03SStefano Zampini 
4654ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4655ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4656ed502f03SStefano Zampini         if (AT) {
4657ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4658ed502f03SStefano Zampini           thrust::advance(rT,-1);
4659ed502f03SStefano Zampini         }
4660ed502f03SStefano Zampini         if (BT) {
4661ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4662ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4663ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4664ed502f03SStefano Zampini         }
4665ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4666ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4667ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4668ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4669ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4670ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4671ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4672ed502f03SStefano Zampini 
4673ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4674ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4675ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4676ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4677ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4678ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4679ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4680ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4681ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4682ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4683ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4684ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4685ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4686ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4687ed502f03SStefano Zampini #endif
4688ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4689ed502f03SStefano Zampini       }
4690ed502f03SStefano Zampini     }
4691ed502f03SStefano Zampini 
4692ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4693ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4694ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4695ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4696ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4697ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4698ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4699ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4700ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4701ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4702ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4703ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4704ed502f03SStefano Zampini     } else {
4705ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4706ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4707ed502f03SStefano Zampini     }
4708ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4709ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4710ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4711ed502f03SStefano Zampini     c->maxnz = c->nz;
4712ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4713ed502f03SStefano Zampini     c->rmax = 0;
4714ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4715ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4716ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4717ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4718ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4719ed502f03SStefano Zampini     }
4720ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4721ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4722ed502f03SStefano Zampini     (*C)->nonzerostate++;
4723ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4724ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4725ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4726ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4727ed502f03SStefano Zampini   } else {
47282c71b3e2SJacob Faibussowitsch     PetscCheckFalse((*C)->rmap->n != B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4729ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4730ed502f03SStefano Zampini     if (c->nz) {
4731ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
47322c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!Ccusp->cooPerm,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
47332c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB,PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
47342c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccusp->nonzerostate != (*C)->nonzerostate,PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4735ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4736ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
47372c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!Acusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
47382c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!Bcusp->mat,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4739ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4740ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4741ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
47422c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Acsr->num_entries != (PetscInt)Acsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
47432c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Bcsr->num_entries != (PetscInt)Bcsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
47442c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != (PetscInt)Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
47452c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries,PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
47462c71b3e2SJacob Faibussowitsch       PetscCheckFalse(Ccusp->cooPerm->size() != Ccsr->values->size(),PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4747ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4748ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4749ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4750ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4751ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4752ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4753ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4754ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4755ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4756ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4757ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4758ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4759ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4760a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
47611a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
47622c71b3e2SJacob Faibussowitsch         PetscCheckFalse(!Ccusp->matTranspose,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4763ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4764ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4765ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4766ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4767ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4768ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4769ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
47701a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4771ed502f03SStefano Zampini       }
4772ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4773ed502f03SStefano Zampini     }
4774ed502f03SStefano Zampini   }
4775ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4776ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4777ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4778ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4779ed502f03SStefano Zampini   PetscFunctionReturn(0);
4780ed502f03SStefano Zampini }
4781c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the GPU values array of A into v

  Input Parameters:
+ A   - the matrix; its values array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - positions inside the values array to gather from, or NULL to copy the leading n entries

  Output Parameter:
. v - destination holding n entries; host or device memory, as reported by isCudaMem()

  Notes:
  When idx is given, it is uploaded into a device vector and v[i] receives the value found at
  position idx[i] of the values array. If v is host memory the gather is performed into a
  temporary device buffer which is then copied back to the host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* indexed gather: move the indices to the device first */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* staging buffer with automatic storage: stays empty when v already lives on the device,
       and is released on every exit path (including the early returns of the error macros) */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[i]] with dv[i]; VecCUDAEquals performs the element-wise assignment */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the caller's host buffer */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index set (or nothing to gather): plain copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from device to host, so account for them as a GPU-to-CPU transfer */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4821