xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 7d3de750dec08ee2edc7d15bcef3046c0443ab7d)
19ae82921SPaul Mullowney /*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
/* Option strings for MatCUSPARSEStorageFormat. This table is handed to PetscOptionsEnum() in
   MatSetFromOptions_SeqAIJCUSPARSE(); the value names come first and the list ends with a null entry. */
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26afb2bd1cSJunchao Zhang 
27afb2bd1cSJunchao Zhang   typedef enum {
28afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
29afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
30afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
31afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
32afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
33afb2bd1cSJunchao Zhang 
34afb2bd1cSJunchao Zhang   typedef enum {
35afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
47afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
48afb2bd1cSJunchao Zhang 
49afb2bd1cSJunchao Zhang   typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
52afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
53afb2bd1cSJunchao Zhang   */
54afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57afb2bd1cSJunchao Zhang #endif
589ae82921SPaul Mullowney 
59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
877f756511SDominic Meiser 
8857181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
89a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
9057181aedSStefano Zampini 
91c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
92c215019aSStefano Zampini 
93b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
94b06137fdSPaul Mullowney {
95b06137fdSPaul Mullowney   cusparseStatus_t   stat;
96b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
97b06137fdSPaul Mullowney 
98b06137fdSPaul Mullowney   PetscFunctionBegin;
99d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
100b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10157d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
102b06137fdSPaul Mullowney   PetscFunctionReturn(0);
103b06137fdSPaul Mullowney }
104b06137fdSPaul Mullowney 
105b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
106b06137fdSPaul Mullowney {
107b06137fdSPaul Mullowney   cusparseStatus_t   stat;
108b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
109b06137fdSPaul Mullowney 
110b06137fdSPaul Mullowney   PetscFunctionBegin;
111d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1126b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11316a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11457d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11516a2e217SAlejandro Lamas Daviña     }
116b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1176b1cf21dSAlejandro Lamas Daviña   }
11857d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
119b06137fdSPaul Mullowney   PetscFunctionReturn(0);
120b06137fdSPaul Mullowney }
121b06137fdSPaul Mullowney 
122b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
123b06137fdSPaul Mullowney {
124b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1257e8381f9SStefano Zampini   PetscBool          flg;
1267e8381f9SStefano Zampini   PetscErrorCode     ierr;
127ccdfe979SStefano Zampini 
128b06137fdSPaul Mullowney   PetscFunctionBegin;
1297e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1307e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
131ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
132b06137fdSPaul Mullowney   PetscFunctionReturn(0);
133b06137fdSPaul Mullowney }
134b06137fdSPaul Mullowney 
135ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1369ae82921SPaul Mullowney {
1379ae82921SPaul Mullowney   PetscFunctionBegin;
1389ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1399ae82921SPaul Mullowney   PetscFunctionReturn(0);
1409ae82921SPaul Mullowney }
1419ae82921SPaul Mullowney 
142c708e6cdSJed Brown /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
149c708e6cdSJed Brown 
1509ae82921SPaul Mullowney   Level: beginner
151c708e6cdSJed Brown 
1523ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
153c708e6cdSJed Brown M*/
1549ae82921SPaul Mullowney 
15542c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1569ae82921SPaul Mullowney {
1579ae82921SPaul Mullowney   PetscErrorCode ierr;
158bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1599ae82921SPaul Mullowney 
1609ae82921SPaul Mullowney   PetscFunctionBegin;
161bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
162bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1632c7c0729SBarry Smith   (*B)->factortype = ftype;
1649ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1652205254eSKarl Rupp 
1669c1083e7SRichard Tran Mills   if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
167087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16833d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1699c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
1709ae82921SPaul Mullowney       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1719ae82921SPaul Mullowney       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1729c1083e7SRichard Tran Mills     } else {
1739c1083e7SRichard Tran Mills       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1749c1083e7SRichard Tran Mills       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1759c1083e7SRichard Tran Mills     }
1764ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1774ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
1784ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
179087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1809c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
181087f3262SPaul Mullowney       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
182087f3262SPaul Mullowney       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1839c1083e7SRichard Tran Mills     } else {
1849c1083e7SRichard Tran Mills       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1859c1083e7SRichard Tran Mills       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1869c1083e7SRichard Tran Mills     }
1874ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
1884ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
1899ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
190bc3f50f2SPaul Mullowney 
191fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1924ac6704cSBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1933ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1949ae82921SPaul Mullowney   PetscFunctionReturn(0);
1959ae82921SPaul Mullowney }
1969ae82921SPaul Mullowney 
197bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
198ca45077fSPaul Mullowney {
199aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2006e111a19SKarl Rupp 
201ca45077fSPaul Mullowney   PetscFunctionBegin;
202ca45077fSPaul Mullowney   switch (op) {
203e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
204aa372e3fSPaul Mullowney     cusparsestruct->format = format;
205ca45077fSPaul Mullowney     break;
206e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
207aa372e3fSPaul Mullowney     cusparsestruct->format = format;
208ca45077fSPaul Mullowney     break;
209ca45077fSPaul Mullowney   default:
21098921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
211ca45077fSPaul Mullowney   }
212ca45077fSPaul Mullowney   PetscFunctionReturn(0);
213ca45077fSPaul Mullowney }
2149ae82921SPaul Mullowney 
215e057df02SPaul Mullowney /*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective
220e057df02SPaul Mullowney 
221e057df02SPaul Mullowney    Input Parameters:
2228468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
22336d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2242692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
225e057df02SPaul Mullowney 
226e057df02SPaul Mullowney    Output Parameter:
227e057df02SPaul Mullowney 
228e057df02SPaul Mullowney    Level: intermediate
229e057df02SPaul Mullowney 
2308468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
231e057df02SPaul Mullowney @*/
232e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
233e057df02SPaul Mullowney {
234e057df02SPaul Mullowney   PetscErrorCode ierr;
2356e111a19SKarl Rupp 
236e057df02SPaul Mullowney   PetscFunctionBegin;
237e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
238e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
239e057df02SPaul Mullowney   PetscFunctionReturn(0);
240e057df02SPaul Mullowney }
241e057df02SPaul Mullowney 
242365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
243365b711fSMark Adams {
244365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
245365b711fSMark Adams 
246365b711fSMark Adams   PetscFunctionBegin;
247365b711fSMark Adams   cusparsestruct->use_cpu_solve = use_cpu;
248365b711fSMark Adams   PetscFunctionReturn(0);
249365b711fSMark Adams }
250365b711fSMark Adams 
251365b711fSMark Adams /*@
252365b711fSMark Adams    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
253365b711fSMark Adams 
254365b711fSMark Adams    Input Parameters:
255365b711fSMark Adams +  A - Matrix of type SEQAIJCUSPARSE
256365b711fSMark Adams -  use_cpu - set flag for using the built-in CPU MatSolve
257365b711fSMark Adams 
258365b711fSMark Adams    Output Parameter:
259365b711fSMark Adams 
260365b711fSMark Adams    Notes:
261365b711fSMark Adams    The cuSparse LU solver currently computes the factors with the built-in CPU method
262365b711fSMark Adams    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
   This method specifies whether the solve is done on the CPU or the GPU (GPU is the default).
264365b711fSMark Adams 
265365b711fSMark Adams    Level: intermediate
266365b711fSMark Adams 
267365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
268365b711fSMark Adams @*/
269365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
270365b711fSMark Adams {
271365b711fSMark Adams   PetscErrorCode ierr;
272365b711fSMark Adams 
273365b711fSMark Adams   PetscFunctionBegin;
274365b711fSMark Adams   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
275365b711fSMark Adams   ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
276365b711fSMark Adams   PetscFunctionReturn(0);
277365b711fSMark Adams }
278365b711fSMark Adams 
2791a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
280e6e9a74fSStefano Zampini {
281e6e9a74fSStefano Zampini   PetscErrorCode ierr;
282e6e9a74fSStefano Zampini 
283e6e9a74fSStefano Zampini   PetscFunctionBegin;
2841a2c6b5cSJunchao Zhang   switch (op) {
2851a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
2861a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
2871a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
2881a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
2891a2c6b5cSJunchao Zhang       break;
2901a2c6b5cSJunchao Zhang     default:
2911a2c6b5cSJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
2921a2c6b5cSJunchao Zhang       break;
293e6e9a74fSStefano Zampini   }
294e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
295e6e9a74fSStefano Zampini }
296e6e9a74fSStefano Zampini 
297bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
298bddcd29dSMark Adams 
299bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
300bddcd29dSMark Adams {
301bddcd29dSMark Adams   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
302bddcd29dSMark Adams   IS             isrow = b->row,iscol = b->col;
303bddcd29dSMark Adams   PetscBool      row_identity,col_identity;
304365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
305bddcd29dSMark Adams   PetscErrorCode ierr;
306bddcd29dSMark Adams 
307bddcd29dSMark Adams   PetscFunctionBegin;
308bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
309bddcd29dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
310bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
311bddcd29dSMark Adams   /* determine which version of MatSolve needs to be used. */
312bddcd29dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
313bddcd29dSMark Adams   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
314bddcd29dSMark Adams   if (row_identity && col_identity) {
315365b711fSMark Adams     if (!cusparsestruct->use_cpu_solve) {
316bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
317bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
318365b711fSMark Adams     }
319bddcd29dSMark Adams     B->ops->matsolve = NULL;
320bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
321bddcd29dSMark Adams   } else {
322365b711fSMark Adams     if (!cusparsestruct->use_cpu_solve) {
323bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
324bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
325365b711fSMark Adams     }
326bddcd29dSMark Adams     B->ops->matsolve = NULL;
327bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
328bddcd29dSMark Adams   }
329bddcd29dSMark Adams 
330bddcd29dSMark Adams   /* get the triangular factors */
331365b711fSMark Adams   if (!cusparsestruct->use_cpu_solve) {
332bddcd29dSMark Adams     ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
333365b711fSMark Adams   }
334bddcd29dSMark Adams   PetscFunctionReturn(0);
335bddcd29dSMark Adams }
336bddcd29dSMark Adams 
3374416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
3389ae82921SPaul Mullowney {
3399ae82921SPaul Mullowney   PetscErrorCode           ierr;
340e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
3419ae82921SPaul Mullowney   PetscBool                flg;
342a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
3436e111a19SKarl Rupp 
3449ae82921SPaul Mullowney   PetscFunctionBegin;
345e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
3469ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
347e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
348a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
349afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
350afb2bd1cSJunchao Zhang 
3514c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
352a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
353afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
354365b711fSMark Adams     ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
355365b711fSMark Adams     if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
356afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
357afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
358afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
359afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
360a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
361a435da06SStefano Zampini     if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
362a435da06SStefano Zampini #else
363afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
364a435da06SStefano Zampini #endif
365afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
366afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
367afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
368afb2bd1cSJunchao Zhang 
369afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
370afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
371afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
372afb2bd1cSJunchao Zhang    #endif
3734c87dfd4SPaul Mullowney   }
3740af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
3759ae82921SPaul Mullowney   PetscFunctionReturn(0);
3769ae82921SPaul Mullowney }
3779ae82921SPaul Mullowney 
3786fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3799ae82921SPaul Mullowney {
380da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3819ae82921SPaul Mullowney   PetscErrorCode               ierr;
3829ae82921SPaul Mullowney 
3839ae82921SPaul Mullowney   PetscFunctionBegin;
384da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3859ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3869ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3879ae82921SPaul Mullowney   PetscFunctionReturn(0);
3889ae82921SPaul Mullowney }
3899ae82921SPaul Mullowney 
3906fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3919ae82921SPaul Mullowney {
392da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3939ae82921SPaul Mullowney   PetscErrorCode               ierr;
3949ae82921SPaul Mullowney 
3959ae82921SPaul Mullowney   PetscFunctionBegin;
396da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3979ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3989ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3999ae82921SPaul Mullowney   PetscFunctionReturn(0);
4009ae82921SPaul Mullowney }
4019ae82921SPaul Mullowney 
402087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
403087f3262SPaul Mullowney {
404da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
405087f3262SPaul Mullowney   PetscErrorCode               ierr;
406087f3262SPaul Mullowney 
407087f3262SPaul Mullowney   PetscFunctionBegin;
408da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
409087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
410087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
411087f3262SPaul Mullowney   PetscFunctionReturn(0);
412087f3262SPaul Mullowney }
413087f3262SPaul Mullowney 
414087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
415087f3262SPaul Mullowney {
416da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
417087f3262SPaul Mullowney   PetscErrorCode               ierr;
418087f3262SPaul Mullowney 
419087f3262SPaul Mullowney   PetscFunctionBegin;
420da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
421087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
422087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
423087f3262SPaul Mullowney   PetscFunctionReturn(0);
424087f3262SPaul Mullowney }
425087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Builds, or refreshes the values of, the lower triangular
  ILU factor stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameter:
. A - matrix whose A->data is read as Mat_SeqAIJ and whose A->spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has zero local rows, and does nothing unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  First call (loTriFactorPtr is NULL): the CSR row offsets, column indices and values are assembled in
  pinned host buffers with an explicit 1.0 appended to every row as its diagonal entry, copied into a
  newly created CsrMatrix, and the cuSPARSE triangular solve analysis is run on it. The two index
  buffers are released afterwards; the value buffer is retained in loTriFactor->AA_h.

  Later calls: only the values are regenerated in loTriFactor->AA_h and reassigned to csrMat->values;
  the column indices, row offsets and analysis information are left untouched.

  The host-side fill starts copying at a->a[0]/a->j[0] for row 1, i.e. it assumes row 0 of the stored
  factor holds no off-diagonal entries - TODO confirm against the SeqAIJ factor storage layout.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *ai = a->i,*aj = a->j,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiLo, *AjLo;
  PetscInt                          i,nz, nzLower, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
      nzLower=n+ai[n]-ai[1];
      if (!loTriFactor) {
        PetscScalar                       *AALo;

        cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the lower triangular matrix */
        cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the lower triangular matrix; row 0 consists of the unit diagonal only */
        AiLo[0]  = (PetscInt) 0;
        AiLo[n]  = nzLower;
        AjLo[0]  = (PetscInt) 0;
        AALo[0]  = (MatScalar) 1.0;
        v        = aa;
        vi       = aj;
        /* offset is both the write cursor and the start of the current row, so it doubles as the row offset */
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          /* the row occupies nz copied entries plus 1 for the term on the diagonal */
          AiLo[i] = offset;

          ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);

          offset      += nz;
          AjLo[offset] = (PetscInt) i;
          AALo[offset] = (MatScalar) 1.0;
          offset      += 1;

          v  += nz;
          vi += nz;
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
        loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        loTriFactor->csrMat = new CsrMatrix;
        loTriFactor->csrMat->num_rows = n;
        loTriFactor->csrMat->num_cols = n;
        loTriFactor->csrMat->num_entries = nzLower;

        loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);

        loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
        loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);

        loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
        loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
        /*
           The version dependent analysis call is written out in full in each preprocessor branch so that no
           directive appears inside a call's argument list (cusparse_analysis may be a function-like macro -
           TODO confirm - in which case an embedded directive is undefined behaviour).
        */
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                       loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                       loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                       loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                       &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                 loTriFactor->solveInfo,
                                 loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
       #else
        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                                 loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                 loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                 loTriFactor->csrMat->column_indices->data().get(),
                                 loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
       #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
        /* keep the pinned value buffer for later value-only updates; the index buffers are no longer needed */
        loTriFactor->AA_h = AALo;
        cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
        /* indices are counted with sizeof(int), presumably because the index arrays are 32-bit - TODO confirm */
        ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!loTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the lower triangular matrix values, using the same layout as the first-time build */
        loTriFactor->AA_h[0]  = 1.0;
        v        = aa;
        offset   = 1;
        for (i=1; i<n; i++) {
          nz = ai[i+1] - ai[i];
          ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
          offset      += nz;
          loTriFactor->AA_h[offset] = 1.0;
          offset      += 1;
          v  += nz;
        }
        loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
        ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      /* NOTE(review): only a thrown char* is handled here; confirm whether the calls above can raise other exception types */
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
5729ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Builds, or refreshes the values of, the upper triangular
  ILU factor stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameter:
. A - matrix whose A->data is read as Mat_SeqAIJ and whose A->spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Returns immediately when A has zero local rows, and does nothing unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Rows are located through a->diag, which is walked from the last row to the first. In the generated CSR
  arrays each row starts with its diagonal entry, whose value is the reciprocal of the value stored at
  a->a[a->diag[i]] (presumably because the factorization stores the inverted diagonal - TODO confirm),
  followed by the off-diagonal entries copied verbatim.

  First call (upTriFactorPtr is NULL): structure and values are assembled in pinned host buffers, copied
  into a newly created CsrMatrix, and the cuSPARSE triangular solve analysis is run on it. The two index
  buffers are released afterwards; the value buffer is retained in upTriFactor->AA_h.

  Later calls: only the values are regenerated in upTriFactor->AA_h and reassigned to csrMat->values.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate Space for the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, last row first, writing from the end of the buffers backwards */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements; v[nz] is the entry at aa[adiag[i]] */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
        /*
           The version dependent analysis call is written out in full in each preprocessor branch so that no
           directive appears inside a call's argument list (cusparse_analysis may be a function-like macro -
           TODO confirm - in which case an embedded directive is undefined behaviour).
        */
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                 upTriFactor->solveInfo,
                                 upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
       #else
        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(),
                                 upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
       #endif
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for later value-only updates; the index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        /* indices are counted with sizeof(int), presumably because the index arrays are 32-bit - TODO confirm */
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix values, using the same layout as the first-time build */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      /* NOTE(review): only a thrown char* is handled here; confirm whether the calls above can raise other exception types */
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
7189ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Prepares the ILU triangular factors of A for use through CUSPARSE.

  Input Parameter:
. A - matrix whose A->data is read as Mat_SeqAIJ and whose A->spptr is read as Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Fails with PETSC_ERR_COR when A->spptr is NULL. Otherwise it
  - calls the lower and upper ILU factor builders,
  - creates the work vector of length A->rmap->n if it does not exist yet,
  - records the nonzero count of A in the factor data and sets A->offloadmask to PETSC_OFFLOAD_BOTH,
  - for each of the row IS (a->row) and column IS (a->icol) that is not the identity, copies its indices
    into rpermIndices / cpermIndices unless that array already exists.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  IS                           rowperm  = aij->row,colperm = aij->icol;
  const PetscInt               nloc     = A->rmap->n;
  const PetscInt               *idx;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* build (or refresh) both triangular factors */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* the work vector is created once and reused afterwards */
  if (!factors->workVector) factors->workVector = new THRUSTARRAY(nloc);
  factors->nnz   = aij->nz;
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: only needed for a non-identity ordering, and only copied once */
  ierr = ISIdentity(rowperm,&isIdentity);CHKERRQ(ierr);
  if (!isIdentity) {
    if (!factors->rpermIndices) {
      ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
      factors->rpermIndices = new THRUSTINTARRAY(nloc);
      factors->rpermIndices->assign(idx, idx+nloc);
      ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu(nloc*sizeof(PetscInt));CHKERRQ(ierr);
    }
  }

  /* column permutation: same treatment as the row permutation */
  ierr = ISIdentity(colperm,&isIdentity);CHKERRQ(ierr);
  if (!isIdentity) {
    if (!factors->cpermIndices) {
      ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
      factors->cpermIndices = new THRUSTINTARRAY(nloc);
      factors->cpermIndices->assign(idx, idx+nloc);
      ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu(nloc*sizeof(PetscInt));CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
7629ae82921SPaul Mullowney 
763087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
764087f3262SPaul Mullowney {
765087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
766087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
767aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
768aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
769087f3262SPaul Mullowney   cusparseStatus_t                  stat;
770087f3262SPaul Mullowney   PetscErrorCode                    ierr;
77157d48284SJunchao Zhang   cudaError_t                       cerr;
772087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
773087f3262SPaul Mullowney   PetscScalar                       *AAUp;
774087f3262SPaul Mullowney   PetscScalar                       *AALo;
775087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
776087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
777087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
778087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
779087f3262SPaul Mullowney 
780087f3262SPaul Mullowney   PetscFunctionBegin;
781cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
782c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
783087f3262SPaul Mullowney     try {
784da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
785da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
786da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
787087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
78857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
78957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
790087f3262SPaul Mullowney 
791087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
792087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
793087f3262SPaul Mullowney         AiUp[n]=nzUpper;
794087f3262SPaul Mullowney         offset = 0;
795087f3262SPaul Mullowney         for (i=0; i<n; i++) {
796087f3262SPaul Mullowney           /* set the pointers */
797087f3262SPaul Mullowney           v  = aa + ai[i];
798087f3262SPaul Mullowney           vj = aj + ai[i];
799087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
800087f3262SPaul Mullowney 
801087f3262SPaul Mullowney           /* first, set the diagonal elements */
802087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
80309f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
804087f3262SPaul Mullowney           AiUp[i]      = offset;
80509f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
806087f3262SPaul Mullowney 
807087f3262SPaul Mullowney           offset+=1;
808087f3262SPaul Mullowney           if (nz>0) {
809f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
810580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
811087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
812087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
813087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
814087f3262SPaul Mullowney             }
815087f3262SPaul Mullowney             offset+=nz;
816087f3262SPaul Mullowney           }
817087f3262SPaul Mullowney         }
818087f3262SPaul Mullowney 
819aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
820da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
821da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
822087f3262SPaul Mullowney 
823aa372e3fSPaul Mullowney         /* Create the matrix description */
82457d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
82557d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8261b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
827afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
828afb2bd1cSJunchao Zhang        #else
82957d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
830afb2bd1cSJunchao Zhang        #endif
83157d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83257d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
833087f3262SPaul Mullowney 
834aa372e3fSPaul Mullowney         /* set the matrix */
835aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
836aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
837aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
838aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
839aa372e3fSPaul Mullowney 
840aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
841aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
842aa372e3fSPaul Mullowney 
843aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
844aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
845aa372e3fSPaul Mullowney 
846aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
847aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
848aa372e3fSPaul Mullowney 
849afb2bd1cSJunchao Zhang         /* set the operation */
850afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
851afb2bd1cSJunchao Zhang 
852afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
853da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
854afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8551b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
856afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
857afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
858afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
859afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
860afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
861afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
862afb2bd1cSJunchao Zhang       #endif
863afb2bd1cSJunchao Zhang 
864aa372e3fSPaul Mullowney         /* perform the solve analysis */
865aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
866aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
867aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
868d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8691b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
870d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
871d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
872d49cd2b7SBarry Smith                                 #else
873d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
874afb2bd1cSJunchao Zhang                                 #endif
875da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
876da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
877aa372e3fSPaul Mullowney 
878da79fbbcSStefano Zampini         /* assign the pointer */
879aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
880aa372e3fSPaul Mullowney 
881aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
882da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
883da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
884aa372e3fSPaul Mullowney 
885aa372e3fSPaul Mullowney         /* Create the matrix description */
88657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
88757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8881b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
889afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
890afb2bd1cSJunchao Zhang        #else
89157d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
892afb2bd1cSJunchao Zhang        #endif
89357d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
89457d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
895aa372e3fSPaul Mullowney 
896aa372e3fSPaul Mullowney         /* set the operation */
897aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
898aa372e3fSPaul Mullowney 
899aa372e3fSPaul Mullowney         /* set the matrix */
900aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
901aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
902aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
903aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
904aa372e3fSPaul Mullowney 
905aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
906aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
907aa372e3fSPaul Mullowney 
908aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
909aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
910aa372e3fSPaul Mullowney 
911aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
912aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
913aa372e3fSPaul Mullowney 
914afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
915da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
916afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
9171b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
918afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
919afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
920afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
921afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
922afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
923afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
924afb2bd1cSJunchao Zhang       #endif
925afb2bd1cSJunchao Zhang 
926aa372e3fSPaul Mullowney         /* perform the solve analysis */
927aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
928aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
929aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
930d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
9311b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
932d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
933d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
934d49cd2b7SBarry Smith                                 #else
935d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
936afb2bd1cSJunchao Zhang                                 #endif
937da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
938da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
939aa372e3fSPaul Mullowney 
940da79fbbcSStefano Zampini         /* assign the pointer */
941aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
942087f3262SPaul Mullowney 
943da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
94457d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
94557d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
946da79fbbcSStefano Zampini       } else {
947da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
948da79fbbcSStefano Zampini         offset = 0;
949da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
950da79fbbcSStefano Zampini           /* set the pointers */
951da79fbbcSStefano Zampini           v  = aa + ai[i];
952da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
953da79fbbcSStefano Zampini 
954da79fbbcSStefano Zampini           /* first, set the diagonal elements */
955da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
956da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
957da79fbbcSStefano Zampini 
958da79fbbcSStefano Zampini           offset+=1;
959da79fbbcSStefano Zampini           if (nz>0) {
960da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
961da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
962da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
963da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
964da79fbbcSStefano Zampini             }
965da79fbbcSStefano Zampini             offset+=nz;
966da79fbbcSStefano Zampini           }
967da79fbbcSStefano Zampini         }
968da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
969da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
970da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
971da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
972da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
973da79fbbcSStefano Zampini       }
97457d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
97557d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
976087f3262SPaul Mullowney     } catch(char *ex) {
97798921bdaSJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
978087f3262SPaul Mullowney     }
979087f3262SPaul Mullowney   }
980087f3262SPaul Mullowney   PetscFunctionReturn(0);
981087f3262SPaul Mullowney }
982087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors for A through
  MatSeqAIJCUSPARSEBuildICCTriMatrices() and, when the factorization ordering is not the
  identity, stores the ordering and its inverse in the triangular-factor structure.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Sets A->offloadmask to PETSC_OFFLOAD_BOTH.

  The work vector and the permutation arrays are allocated only if they do not exist yet and are
  refilled otherwise, so calling this routine again for a new numeric factorization does not
  leak the arrays created by the previous call.

  Returns PETSC_ERR_COR if A->spptr is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* every off-diagonal entry is counted twice (once per triangular factor), the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices: only needed when the ordering is not the natural one */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* reuse the arrays left by a previous call instead of overwriting (and leaking) the pointers */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1020087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SEQAIJCUSPARSE.

  Copies A back from the GPU, runs the SeqAIJ numeric factorization into B, selects the
  CUSPARSE solve routines that match the ordering stored in B, and finally builds the
  triangular factors through MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU().

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded to MatCholeskyFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *factor = (Mat_SeqAIJ*)B->data;
  IS             rowperm = factor->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* the natural (identity) ordering gets the solve variants that skip the permutation */
  ierr = ISIdentity(rowperm,&natural);CHKERRQ(ierr);
  B->ops->solve          = natural ? MatSolve_SeqAIJCUSPARSE_NaturalOrdering          : MatSolve_SeqAIJCUSPARSE;
  B->ops->solvetranspose = natural ? MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering : MatSolveTranspose_SeqAIJCUSPARSE;
  /* no multi-right-hand-side solves are installed for either ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10509ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private - Builds the explicit transpose (CSC form) of one
  triangular factor and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used only as the object attached to the log events
. cusparseTriFactors - container that provides the cuSPARSE handle
- triFactor          - the triangular factor to transpose

  Output Parameter:
. triFactorTranspose - the newly allocated transposed factor

  Notes:
  The new descriptor copies the matrix type, index base and diagonal type of triFactor and flips
  the fill mode; the solve operation is CUSPARSE_OPERATION_NON_TRANSPOSE.

  With CUDA >= 11 the csr2csc work buffer is allocated here and stored in the source factor
  (triFactor->csr2cscBuffer); with CUDA >= 9 the solve buffer is stored in the new factor.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTranspose)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the source factor; only the fill mode is flipped for the transpose */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase = cusparseGetMatIndexBase(triFactor->descr);
  fillMode = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above: this has to be End, a second Begin leaves the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           triFactorT->solveInfo,
                           triFactorT->solvePolicy, triFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           triFactorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTranspose = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds the explicit transposes of the lower and upper
  triangular factors stored in A->spptr, runs the cuSPARSE solve analysis on each, and stores the
  results in loTriFactorPtrTranspose and upTriFactorPtrTranspose.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose loTriFactorPtr
      and upTriFactorPtr are already set (they are dereferenced without a NULL check)
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /*********************************************/
  /* Now the Transpose of the Upper Tri Factor */
  /*********************************************/
  ierr = MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1256bda325fcSPaul Mullowney 
/*
  Unary functor usable on host and device: converts a PetscScalar to a PetscInt by taking
  its real part and casting (the cast truncates any fractional part).
*/
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1265a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - builds (or refreshes) the explicit transpose of A on the GPU
  and stores it in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; it is first copied to the GPU with MatSeqAIJCUSPARSECopyToGPU()

  Notes:
  Returns immediately when A->transupdated is already set.

  CSR format: the transpose structure is created once. On the first pass csr2csc is run on a value
  array holding 0,1,...,nnz-1, so the transposed "values" are the permutation that maps CSR entries to
  CSC entries; that permutation is cached in cusparsestruct->csr2csc_i. Every pass then only gathers
  the current values of A through the cached permutation.

  ELL/HYB formats (only available before CUDA-11.0): any previous transpose is invalidated and the
  transpose is rebuilt through HYB -> CSR -> CSC -> HYB conversions.

  On success A->transupdated is set to PETSC_TRUE. Errors are reported through the usual
  SETERRQ/CHKERRQ/CHKERRCUDA/CHKERRCUSPARSE macros.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* non-CSR transposes cannot be updated in place: drop the old one and rebuild from scratch below */
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants 1, 0 and 1 */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the CSR storage of A^T (cmap->n rows, rmap->n columns); it is filled further below */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has cmap->n rows and rmap->n columns, and csr2csc writes cmap->n+1 column
         pointers into tempT->row_offsets, so the array must be sized by the column count of A
         (sizing it by rmap->n+1 overflows whenever A has more columns than rows). */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(A->cmap->n+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB; the CSR matrix handed to csr2hyb is A^T, hence cmap->n by rmap->n */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, A->cmap->n, A->rmap->n,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY*) tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
        delete (CsrMatrix*) tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY*) temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
        delete (CsrMatrix*) temp;
      }
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* transpose the sequence 0,1,...,nnz-1 instead of the real values: the result is the CSR->CSC permutation */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      /* When there are no nonzeros, csr2csc mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
         mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
         I checked every parameters and they were just fine. I have no clue why cusparse complains.

         Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
         should be filled with indexBase. So the conversion is skipped and that shortcut is taken for empty matrices.
      */
      if (matrix->num_entries) {
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* cache the permutation as integers so later calls only need the gather below */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather the current values of A into the transpose: valuesT[k] = values[csr2csc_i[k]] */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1495bda325fcSPaul Mullowney 
1496a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose triangular solve on the GPU using the row and
  column permutations stored in the factor data.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are created and analyzed on first use through
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). The factor work vector is used as scratch space.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                          n = xx->map->n;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build and analyze the transposed factors on the fly if neither exists yet */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the vectors, wrapped so thrust algorithms can iterate over them */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  thrust::device_ptr<PetscScalar>       xGPU = thrust::device_pointer_cast(xarray);
  thrust::device_ptr<const PetscScalar> bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: gather b through the row permutation into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU);

  /* Step 2: solve with the transposed U factor, x -> work vector */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        upTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
 #endif

  /* Step 3: solve with the transposed L factor, work vector -> x */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        loTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray);CHKERRCUSPARSE(stat);
 #endif

  /* Step 4: gather x through the column permutation; this cannot be done in place, so go through the work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin());

  /* Step 5: move the permuted result from the work vector back into x */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU);

  /* Hand the arrays back and record timing and flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1584bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose triangular solve on the GPU without
  any row or column permutation.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are created and analyzed on first use through
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). The factor work vector holds the intermediate result
  between the two triangular solves.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build and analyze the transposed factors on the fly if neither exists yet */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the solution (write-only) and the right-hand side (read-only) */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: solve with the transposed U factor, b -> work vector */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        upTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        barray,
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
 #endif

  /* Step 2: solve with the transposed L factor, work vector -> x */
 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        loTriFactorT->csrMat->num_entries,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
 #else
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                        xarray);CHKERRCUSPARSE(stat);
 #endif

  /* Hand the arrays back and record timing and flops */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1654bda325fcSPaul Mullowney 
16556fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
16569ae82921SPaul Mullowney {
1657465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1658465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1659465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1660465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
16619ae82921SPaul Mullowney   cusparseStatus_t                      stat;
16629ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1663aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1664aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1665aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1666b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
16679ae82921SPaul Mullowney 
16689ae82921SPaul Mullowney   PetscFunctionBegin;
1669ebc8f436SDominic Meiser 
1670e057df02SPaul Mullowney   /* Get the GPU pointers */
1671c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1672c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1673c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1674c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
16759ae82921SPaul Mullowney 
16767a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1677aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1678a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1679c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
16804e4bbfaaSStefano Zampini                tempGPU->begin());
1681aa372e3fSPaul Mullowney 
1682aa372e3fSPaul Mullowney   /* Next, solve L */
1683aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1684afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16851b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1686afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1687afb2bd1cSJunchao Zhang                       #endif
1688afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1689aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1690aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1691aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1692aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1693d49cd2b7SBarry Smith                         tempGPU->data().get(),
16941b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1695d49cd2b7SBarry Smith                          xarray,
1696d49cd2b7SBarry Smith                          loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1697d49cd2b7SBarry Smith                       #else
1698d49cd2b7SBarry Smith                          xarray);CHKERRCUSPARSE(stat);
1699afb2bd1cSJunchao Zhang                       #endif
1700aa372e3fSPaul Mullowney 
1701aa372e3fSPaul Mullowney   /* Then, solve U */
1702aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1703afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
17041b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1705afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1706afb2bd1cSJunchao Zhang                       #endif
1707afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1708aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1709aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1710aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1711d49cd2b7SBarry Smith                         upTriFactor->solveInfo,xarray,
17121b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1713d49cd2b7SBarry Smith                         tempGPU->data().get(),
1714d49cd2b7SBarry Smith                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1715d49cd2b7SBarry Smith                       #else
1716d49cd2b7SBarry Smith                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1717afb2bd1cSJunchao Zhang                       #endif
1718d49cd2b7SBarry Smith 
17194e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1720a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
17214e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
17224e4bbfaaSStefano Zampini                xGPU);
17239ae82921SPaul Mullowney 
1724c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1725c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1726661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1727958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
17289ae82921SPaul Mullowney   PetscFunctionReturn(0);
17299ae82921SPaul Mullowney }
17309ae82921SPaul Mullowney 
17316fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
17329ae82921SPaul Mullowney {
1733465f34aeSAlejandro Lamas Daviña   const PetscScalar                 *barray;
1734465f34aeSAlejandro Lamas Daviña   PetscScalar                       *xarray;
17359ae82921SPaul Mullowney   cusparseStatus_t                  stat;
17369ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1737aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1738aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1739aa372e3fSPaul Mullowney   THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1740b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
17419ae82921SPaul Mullowney 
17429ae82921SPaul Mullowney   PetscFunctionBegin;
1743e057df02SPaul Mullowney   /* Get the GPU pointers */
1744c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1745c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
17469ae82921SPaul Mullowney 
17477a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1748aa372e3fSPaul Mullowney   /* First, solve L */
1749aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1750afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
17511b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1752afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1753afb2bd1cSJunchao Zhang                       #endif
1754afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1755aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1756aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1757aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1758aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1759d49cd2b7SBarry Smith                         barray,
17601b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1761d49cd2b7SBarry Smith                         tempGPU->data().get(),
1762d49cd2b7SBarry Smith                         loTriFactor->solvePolicy,loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1763d49cd2b7SBarry Smith                       #else
1764d49cd2b7SBarry Smith                         tempGPU->data().get());CHKERRCUSPARSE(stat);
1765afb2bd1cSJunchao Zhang                       #endif
1766d49cd2b7SBarry Smith 
1767aa372e3fSPaul Mullowney   /* Next, solve U */
1768aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1769afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
17701b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1771afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1772afb2bd1cSJunchao Zhang                       #endif
1773afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1774aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1775aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1776aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1777aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1778d49cd2b7SBarry Smith                         tempGPU->data().get(),
17791b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1780d49cd2b7SBarry Smith                         xarray,
1781d49cd2b7SBarry Smith                         upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
1782d49cd2b7SBarry Smith                       #else
1783d49cd2b7SBarry Smith                         xarray);CHKERRCUSPARSE(stat);
1784afb2bd1cSJunchao Zhang                       #endif
17859ae82921SPaul Mullowney 
1786c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1787c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
1788661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1789958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
17909ae82921SPaul Mullowney   PetscFunctionReturn(0);
17919ae82921SPaul Mullowney }
17929ae82921SPaul Mullowney 
17937e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
17947e8381f9SStefano Zampini {
17957e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
17967e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
17977e8381f9SStefano Zampini   cudaError_t        cerr;
17987e8381f9SStefano Zampini   PetscErrorCode     ierr;
17997e8381f9SStefano Zampini 
18007e8381f9SStefano Zampini   PetscFunctionBegin;
18017e8381f9SStefano Zampini   if (A->offloadmask == PETSC_OFFLOAD_GPU) {
18027e8381f9SStefano Zampini     CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat;
18037e8381f9SStefano Zampini 
18047e8381f9SStefano Zampini     ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
18057e8381f9SStefano Zampini     cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
18067e8381f9SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
18077e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
18087e8381f9SStefano Zampini     ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
18097e8381f9SStefano Zampini     A->offloadmask = PETSC_OFFLOAD_BOTH;
18107e8381f9SStefano Zampini   }
18117e8381f9SStefano Zampini   PetscFunctionReturn(0);
18127e8381f9SStefano Zampini }
18137e8381f9SStefano Zampini 
18147e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
18157e8381f9SStefano Zampini {
18167e8381f9SStefano Zampini   PetscErrorCode ierr;
18177e8381f9SStefano Zampini 
18187e8381f9SStefano Zampini   PetscFunctionBegin;
18197e8381f9SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
182067a45760SJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
182167a45760SJunchao Zhang   PetscFunctionReturn(0);
182267a45760SJunchao Zhang }
182367a45760SJunchao Zhang 
182467a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
182567a45760SJunchao Zhang {
182667a45760SJunchao Zhang   PetscFunctionBegin;
18277e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
182867a45760SJunchao Zhang   *array         = NULL;
182967a45760SJunchao Zhang   PetscFunctionReturn(0);
183067a45760SJunchao Zhang }
183167a45760SJunchao Zhang 
183267a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
183367a45760SJunchao Zhang {
183467a45760SJunchao Zhang   PetscErrorCode ierr;
183567a45760SJunchao Zhang 
183667a45760SJunchao Zhang   PetscFunctionBegin;
183767a45760SJunchao Zhang   ierr   = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
183867a45760SJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
183967a45760SJunchao Zhang   PetscFunctionReturn(0);
184067a45760SJunchao Zhang }
184167a45760SJunchao Zhang 
184267a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
184367a45760SJunchao Zhang {
184467a45760SJunchao Zhang   PetscFunctionBegin;
184567a45760SJunchao Zhang   *array = NULL;
184667a45760SJunchao Zhang   PetscFunctionReturn(0);
184767a45760SJunchao Zhang }
184867a45760SJunchao Zhang 
184967a45760SJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
185067a45760SJunchao Zhang {
185167a45760SJunchao Zhang   PetscFunctionBegin;
185267a45760SJunchao Zhang   *array = ((Mat_SeqAIJ*)A->data)->a;
185367a45760SJunchao Zhang   PetscFunctionReturn(0);
185467a45760SJunchao Zhang }
185567a45760SJunchao Zhang 
185667a45760SJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
185767a45760SJunchao Zhang {
185867a45760SJunchao Zhang   PetscFunctionBegin;
185967a45760SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_CPU;
186067a45760SJunchao Zhang   *array         = NULL;
18617e8381f9SStefano Zampini   PetscFunctionReturn(0);
18627e8381f9SStefano Zampini }
18637e8381f9SStefano Zampini 
1864042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
18659ae82921SPaul Mullowney {
1866aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
18677c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
18689ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1869213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
18709ae82921SPaul Mullowney   PetscErrorCode               ierr;
1871aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
1872abb89eb1SStefano Zampini   PetscBool                    both = PETSC_TRUE;
1873b06137fdSPaul Mullowney   cudaError_t                  err;
18749ae82921SPaul Mullowney 
18759ae82921SPaul Mullowney   PetscFunctionBegin;
1876e8d2b73aSMark Adams   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1877c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1878a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1879a49f1ed0SStefano Zampini       CsrMatrix *matrix;
1880afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
188185ba7357SStefano Zampini 
1882e8d2b73aSMark Adams       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
188385ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1884afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
188505035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
18864863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
188785ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1888a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
188934d6c7a5SJose E. Roman     } else {
1890abb89eb1SStefano Zampini       PetscInt nnz;
189185ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
18927c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1893a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
18947c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
189581902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
1896a49f1ed0SStefano Zampini       cusparsestruct->workVector = NULL;
1897a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
18989ae82921SPaul Mullowney       try {
18999ae82921SPaul Mullowney         if (a->compressedrow.use) {
19009ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
19019ae82921SPaul Mullowney           ii   = a->compressedrow.i;
19029ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
19039ae82921SPaul Mullowney         } else {
1904213423ffSJunchao Zhang           m    = A->rmap->n;
1905213423ffSJunchao Zhang           ii   = a->i;
1906e6e9a74fSStefano Zampini           ridx = NULL;
19079ae82921SPaul Mullowney         }
1908e8d2b73aSMark Adams         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1909e8d2b73aSMark Adams         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
1910abb89eb1SStefano Zampini         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1911abb89eb1SStefano Zampini         else nnz = a->nz;
19129ae82921SPaul Mullowney 
191385ba7357SStefano Zampini         /* create cusparse matrix */
1914abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
1915aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
191657d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
191757d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
191857d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
19199ae82921SPaul Mullowney 
1920afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
19217656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
19227656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1923afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
19247656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
19257656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
192657d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1927b06137fdSPaul Mullowney 
1928aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1929aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1930aa372e3fSPaul Mullowney           /* set the matrix */
1931afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1932afb2bd1cSJunchao Zhang           mat->num_rows = m;
1933afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1934abb89eb1SStefano Zampini           mat->num_entries = nnz;
1935afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1936afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
19379ae82921SPaul Mullowney 
1938abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1939abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1940aa372e3fSPaul Mullowney 
1941abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1942abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1943aa372e3fSPaul Mullowney 
1944aa372e3fSPaul Mullowney           /* assign the pointer */
1945afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1946afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1947afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1948afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1949afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1950afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1951afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1952afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1953afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1954afb2bd1cSJunchao Zhang           }
1955afb2bd1cSJunchao Zhang          #endif
1956aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1957afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1958afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1959afb2bd1cSJunchao Zhang          #else
1960afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1961afb2bd1cSJunchao Zhang           mat->num_rows = m;
1962afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1963abb89eb1SStefano Zampini           mat->num_entries = nnz;
1964afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1965afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1966aa372e3fSPaul Mullowney 
1967abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1968abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1969aa372e3fSPaul Mullowney 
1970abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1971abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1972aa372e3fSPaul Mullowney 
1973aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
197457d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1975aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1976aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1977afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1978afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1979afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1980afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
198157d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1982aa372e3fSPaul Mullowney           /* assign the pointer */
1983aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1984aa372e3fSPaul Mullowney 
1985afb2bd1cSJunchao Zhang           if (mat) {
1986afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1987afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1988afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1989afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1990087f3262SPaul Mullowney           }
1991afb2bd1cSJunchao Zhang          #endif
1992087f3262SPaul Mullowney         }
1993ca45077fSPaul Mullowney 
1994aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1995213423ffSJunchao Zhang         if (a->compressedrow.use) {
1996213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1997aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1998aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
1999213423ffSJunchao Zhang           tmp = m;
2000213423ffSJunchao Zhang         } else {
2001213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
2002213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
2003213423ffSJunchao Zhang           tmp = 0;
2004213423ffSJunchao Zhang         }
2005213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
2006aa372e3fSPaul Mullowney 
2007aa372e3fSPaul Mullowney         /* assign the pointer */
2008aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
20099ae82921SPaul Mullowney       } catch(char *ex) {
201098921bdaSJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
20119ae82921SPaul Mullowney       }
201205035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
201385ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
201434d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
201534d6c7a5SJose E. Roman     }
2016abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
20179ae82921SPaul Mullowney   }
20189ae82921SPaul Mullowney   PetscFunctionReturn(0);
20199ae82921SPaul Mullowney }
20209ae82921SPaul Mullowney 
2021c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals
2022aa372e3fSPaul Mullowney {
2023aa372e3fSPaul Mullowney   template <typename Tuple>
2024aa372e3fSPaul Mullowney   __host__ __device__
2025aa372e3fSPaul Mullowney   void operator()(Tuple t)
2026aa372e3fSPaul Mullowney   {
2027aa372e3fSPaul Mullowney     thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
2028aa372e3fSPaul Mullowney   }
2029aa372e3fSPaul Mullowney };
2030aa372e3fSPaul Mullowney 
20317e8381f9SStefano Zampini struct VecCUDAEquals
20327e8381f9SStefano Zampini {
20337e8381f9SStefano Zampini   template <typename Tuple>
20347e8381f9SStefano Zampini   __host__ __device__
20357e8381f9SStefano Zampini   void operator()(Tuple t)
20367e8381f9SStefano Zampini   {
20377e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
20387e8381f9SStefano Zampini   }
20397e8381f9SStefano Zampini };
20407e8381f9SStefano Zampini 
2041e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse
2042e6e9a74fSStefano Zampini {
2043e6e9a74fSStefano Zampini   template <typename Tuple>
2044e6e9a74fSStefano Zampini   __host__ __device__
2045e6e9a74fSStefano Zampini   void operator()(Tuple t)
2046e6e9a74fSStefano Zampini   {
2047e6e9a74fSStefano Zampini     thrust::get<0>(t) = thrust::get<1>(t);
2048e6e9a74fSStefano Zampini   }
2049e6e9a74fSStefano Zampini };
2050e6e9a74fSStefano Zampini 
2051afb2bd1cSJunchao Zhang struct MatMatCusparse {
2052ccdfe979SStefano Zampini   PetscBool             cisdense;
2053ccdfe979SStefano Zampini   PetscScalar           *Bt;
2054ccdfe979SStefano Zampini   Mat                   X;
2055fcdce8c4SStefano Zampini   PetscBool             reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
2056fcdce8c4SStefano Zampini   PetscLogDouble        flops;
2057fcdce8c4SStefano Zampini   CsrMatrix             *Bcsr;
2058b4285af6SJunchao Zhang 
2059afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2060fcdce8c4SStefano Zampini   cusparseSpMatDescr_t  matSpBDescr;
2061afb2bd1cSJunchao Zhang   PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
2062afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matBDescr;
2063afb2bd1cSJunchao Zhang   cusparseDnMatDescr_t  matCDescr;
2064afb2bd1cSJunchao Zhang   PetscInt              Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
2065b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2066b4285af6SJunchao Zhang   void                  *dBuffer4;
2067b4285af6SJunchao Zhang   void                  *dBuffer5;
2068b4285af6SJunchao Zhang  #endif
2069fcdce8c4SStefano Zampini   size_t                mmBufferSize;
2070fcdce8c4SStefano Zampini   void                  *mmBuffer;
2071fcdce8c4SStefano Zampini   void                  *mmBuffer2; /* SpGEMM WorkEstimation buffer */
2072fcdce8c4SStefano Zampini   cusparseSpGEMMDescr_t spgemmDesc;
2073afb2bd1cSJunchao Zhang #endif
2074afb2bd1cSJunchao Zhang };
2075ccdfe979SStefano Zampini 
2076ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
2077ccdfe979SStefano Zampini {
2078ccdfe979SStefano Zampini   PetscErrorCode   ierr;
2079ccdfe979SStefano Zampini   MatMatCusparse   *mmdata = (MatMatCusparse *)data;
2080ccdfe979SStefano Zampini   cudaError_t      cerr;
2081fcdce8c4SStefano Zampini  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2082fcdce8c4SStefano Zampini   cusparseStatus_t stat;
2083fcdce8c4SStefano Zampini  #endif
2084ccdfe979SStefano Zampini 
2085ccdfe979SStefano Zampini   PetscFunctionBegin;
2086ccdfe979SStefano Zampini   cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
2087fcdce8c4SStefano Zampini   delete mmdata->Bcsr;
2088afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2089fcdce8c4SStefano Zampini   if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); }
2090afb2bd1cSJunchao Zhang   if (mmdata->matBDescr)   { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); }
2091afb2bd1cSJunchao Zhang   if (mmdata->matCDescr)   { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); }
2092fcdce8c4SStefano Zampini   if (mmdata->spgemmDesc)  { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); }
2093b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2094b4285af6SJunchao Zhang   if (mmdata->dBuffer4)  { cerr = cudaFree(mmdata->dBuffer4);CHKERRCUDA(cerr); }
2095b4285af6SJunchao Zhang   if (mmdata->dBuffer5)  { cerr = cudaFree(mmdata->dBuffer5);CHKERRCUDA(cerr); }
2096b4285af6SJunchao Zhang  #endif
2097b4285af6SJunchao Zhang   if (mmdata->mmBuffer)  { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); }
2098b4285af6SJunchao Zhang   if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); }
2099afb2bd1cSJunchao Zhang  #endif
2100ccdfe979SStefano Zampini   ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
2101ccdfe979SStefano Zampini   ierr = PetscFree(data);CHKERRQ(ierr);
2102ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2103ccdfe979SStefano Zampini }
2104ccdfe979SStefano Zampini 
2105ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
2106ccdfe979SStefano Zampini 
2107ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2108ccdfe979SStefano Zampini {
2109ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2110ccdfe979SStefano Zampini   Mat                          A,B;
2111afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
2112ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
2113ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
2114ccdfe979SStefano Zampini   cusparseStatus_t             stat;
2115ccdfe979SStefano Zampini   cusparseOperation_t          opA;
2116ccdfe979SStefano Zampini   const PetscScalar            *barray;
2117ccdfe979SStefano Zampini   PetscScalar                  *carray;
2118ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2119ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
2120ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
2121ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
2122ccdfe979SStefano Zampini 
2123ccdfe979SStefano Zampini   PetscFunctionBegin;
2124ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2125e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2126ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
2127ccdfe979SStefano Zampini   A    = product->A;
2128ccdfe979SStefano Zampini   B    = product->B;
2129ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
213098921bdaSJacob Faibussowitsch   if (!flg) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2131ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2132ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
2133ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2134ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2135ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2136ccdfe979SStefano Zampini   switch (product->type) {
2137ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2138ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2139ccdfe979SStefano Zampini     mat = cusp->mat;
2140ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2141ccdfe979SStefano Zampini     m   = A->rmap->n;
2142ccdfe979SStefano Zampini     n   = B->cmap->n;
2143ccdfe979SStefano Zampini     break;
2144ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
21451a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2146e6e9a74fSStefano Zampini       mat = cusp->mat;
2147e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2148e6e9a74fSStefano Zampini     } else {
21493606e59fSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2150ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2151ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2152e6e9a74fSStefano Zampini     }
2153ccdfe979SStefano Zampini     m = A->cmap->n;
2154ccdfe979SStefano Zampini     n = B->cmap->n;
2155ccdfe979SStefano Zampini     break;
2156ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2157ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2158ccdfe979SStefano Zampini     mat = cusp->mat;
2159ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2160ccdfe979SStefano Zampini     m   = A->rmap->n;
2161ccdfe979SStefano Zampini     n   = B->rmap->n;
2162ccdfe979SStefano Zampini     break;
2163ccdfe979SStefano Zampini   default:
216498921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2165ccdfe979SStefano Zampini   }
2166e8d2b73aSMark Adams   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2167ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2168ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2169ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2170afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2171ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2172afb2bd1cSJunchao Zhang 
2173ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2174c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2175c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2176c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2177c8378d12SStefano Zampini   } else {
2178c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2179c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2180c8378d12SStefano Zampini   }
2181c8378d12SStefano Zampini 
2182c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2183afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2184afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2185a5b23f4aSJose E. Roman   /* (re)allocate mmBuffer if not initialized or LDAs are different */
2186afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2187fcdce8c4SStefano Zampini     size_t mmBufferSize;
2188afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2189afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2190afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2191afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2192afb2bd1cSJunchao Zhang     }
2193c8378d12SStefano Zampini 
2194afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2195afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2196afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2197afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2198afb2bd1cSJunchao Zhang     }
2199afb2bd1cSJunchao Zhang 
2200afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2201afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2202afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2203afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2204afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2205afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2206afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2207afb2bd1cSJunchao Zhang     }
2208afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2209afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2210afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2211fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2212fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2213ee7b52eaSHong Zhang       cudaError_t cerr;
2214fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2215fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2216fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2217fcdce8c4SStefano Zampini     }
2218afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2219afb2bd1cSJunchao Zhang   } else {
2220afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2221afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2222afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2223afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2224afb2bd1cSJunchao Zhang   }
2225afb2bd1cSJunchao Zhang 
2226afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2227afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2228afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2229afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2230fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2231afb2bd1cSJunchao Zhang  #else
2232afb2bd1cSJunchao Zhang   PetscInt k;
2233afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2234ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2235ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2236ccdfe979SStefano Zampini     cublasStatus_t cerr;
2237ccdfe979SStefano Zampini 
2238ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2239ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2240ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2241ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2242ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2243ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2244ccdfe979SStefano Zampini     blda = B->cmap->n;
2245afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2246afb2bd1cSJunchao Zhang   } else {
2247afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2248ccdfe979SStefano Zampini   }
2249ccdfe979SStefano Zampini 
2250afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2251ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2252afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2253ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2254ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2255ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2256ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2257ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2258afb2bd1cSJunchao Zhang  #endif
2259c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2260c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2261ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2262ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2263ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2264ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2265ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2266ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2267ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2268ccdfe979SStefano Zampini   } else {
2269ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2270ccdfe979SStefano Zampini   }
2271ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2272ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2273ccdfe979SStefano Zampini   }
2274ccdfe979SStefano Zampini   if (!biscuda) {
2275ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2276ccdfe979SStefano Zampini   }
2277ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2278ccdfe979SStefano Zampini }
2279ccdfe979SStefano Zampini 
2280ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
2281ccdfe979SStefano Zampini {
2282ccdfe979SStefano Zampini   Mat_Product        *product = C->product;
2283ccdfe979SStefano Zampini   Mat                A,B;
2284ccdfe979SStefano Zampini   PetscInt           m,n;
2285ccdfe979SStefano Zampini   PetscBool          cisdense,flg;
2286ccdfe979SStefano Zampini   PetscErrorCode     ierr;
2287ccdfe979SStefano Zampini   MatMatCusparse     *mmdata;
2288ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp;
2289ccdfe979SStefano Zampini 
2290ccdfe979SStefano Zampini   PetscFunctionBegin;
2291ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2292e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2293ccdfe979SStefano Zampini   A    = product->A;
2294ccdfe979SStefano Zampini   B    = product->B;
2295ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
229698921bdaSJacob Faibussowitsch   if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2297ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2298e8d2b73aSMark Adams   if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2299ccdfe979SStefano Zampini   switch (product->type) {
2300ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2301ccdfe979SStefano Zampini     m = A->rmap->n;
2302ccdfe979SStefano Zampini     n = B->cmap->n;
2303ccdfe979SStefano Zampini     break;
2304ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2305ccdfe979SStefano Zampini     m = A->cmap->n;
2306ccdfe979SStefano Zampini     n = B->cmap->n;
2307ccdfe979SStefano Zampini     break;
2308ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2309ccdfe979SStefano Zampini     m = A->rmap->n;
2310ccdfe979SStefano Zampini     n = B->rmap->n;
2311ccdfe979SStefano Zampini     break;
2312ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2313ccdfe979SStefano Zampini     m = B->cmap->n;
2314ccdfe979SStefano Zampini     n = B->cmap->n;
2315ccdfe979SStefano Zampini     break;
2316ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2317ccdfe979SStefano Zampini     m = B->rmap->n;
2318ccdfe979SStefano Zampini     n = B->rmap->n;
2319ccdfe979SStefano Zampini     break;
2320ccdfe979SStefano Zampini   default:
232198921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2322ccdfe979SStefano Zampini   }
2323ccdfe979SStefano Zampini   ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2324ccdfe979SStefano Zampini   /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
2325ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
2326ccdfe979SStefano Zampini   ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);
2327ccdfe979SStefano Zampini 
2328ccdfe979SStefano Zampini   /* product data */
2329ccdfe979SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2330ccdfe979SStefano Zampini   mmdata->cisdense = cisdense;
2331afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
2332afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
2333ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2334afb2bd1cSJunchao Zhang     cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
2335ccdfe979SStefano Zampini   }
2336afb2bd1cSJunchao Zhang  #endif
2337ccdfe979SStefano Zampini   /* for these products we need intermediate storage */
2338ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2339ccdfe979SStefano Zampini     ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
2340ccdfe979SStefano Zampini     ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
2341ccdfe979SStefano Zampini     if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
2342ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
2343ccdfe979SStefano Zampini     } else {
2344ccdfe979SStefano Zampini       ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
2345ccdfe979SStefano Zampini     }
2346ccdfe979SStefano Zampini   }
2347ccdfe979SStefano Zampini   C->product->data    = mmdata;
2348ccdfe979SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2349ccdfe979SStefano Zampini 
2350ccdfe979SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
2351ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2352ccdfe979SStefano Zampini }
2353ccdfe979SStefano Zampini 
2354fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2355ccdfe979SStefano Zampini {
2356ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
2357fcdce8c4SStefano Zampini   Mat                          A,B;
2358fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2359fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
2360fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2361fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2362fcdce8c4SStefano Zampini   PetscBool                    flg;
2363ccdfe979SStefano Zampini   PetscErrorCode               ierr;
2364fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2365fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2366fcdce8c4SStefano Zampini   MatProductType               ptype;
2367fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2368fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2369fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2370fcdce8c4SStefano Zampini #endif
2371b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2372ccdfe979SStefano Zampini 
2373ccdfe979SStefano Zampini   PetscFunctionBegin;
2374ccdfe979SStefano Zampini   MatCheckProduct(C,1);
2375e8d2b73aSMark Adams   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
2376fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
237798921bdaSJacob Faibussowitsch   if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
2378fcdce8c4SStefano Zampini   mmdata = (MatMatCusparse*)C->product->data;
2379fcdce8c4SStefano Zampini   A = product->A;
2380fcdce8c4SStefano Zampini   B = product->B;
2381fcdce8c4SStefano Zampini   if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
2382fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_FALSE;
2383fcdce8c4SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2384e8d2b73aSMark Adams     if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2385fcdce8c4SStefano Zampini     Cmat = Ccusp->mat;
238698921bdaSJacob Faibussowitsch     if (!Cmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
2387fcdce8c4SStefano Zampini     Ccsr = (CsrMatrix*)Cmat->mat;
2388e8d2b73aSMark Adams     if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2389fcdce8c4SStefano Zampini     goto finalize;
2390fcdce8c4SStefano Zampini   }
2391fcdce8c4SStefano Zampini   if (!c->nz) goto finalize;
2392fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
239398921bdaSJacob Faibussowitsch   if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2394fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
239598921bdaSJacob Faibussowitsch   if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2396fcdce8c4SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2397fcdce8c4SStefano Zampini   if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2398fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2399fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2400fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2401e8d2b73aSMark Adams   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2402e8d2b73aSMark Adams   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2403e8d2b73aSMark Adams   if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2404fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2405fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2406fcdce8c4SStefano Zampini 
2407fcdce8c4SStefano Zampini   ptype = product->type;
2408fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2409fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2410fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
2411fa046f9fSJunchao Zhang   }
2412fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2413fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2414fa046f9fSJunchao Zhang     if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
2415fa046f9fSJunchao Zhang   }
2416fcdce8c4SStefano Zampini   switch (ptype) {
2417fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2418fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2419fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2420fcdce8c4SStefano Zampini     break;
2421fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2422fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2423fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2424fcdce8c4SStefano Zampini     break;
2425fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2426fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2427fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2428fcdce8c4SStefano Zampini     break;
2429fcdce8c4SStefano Zampini   default:
243098921bdaSJacob Faibussowitsch     SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2431fcdce8c4SStefano Zampini   }
2432fcdce8c4SStefano Zampini   Cmat = Ccusp->mat;
243398921bdaSJacob Faibussowitsch   if (!Amat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
243498921bdaSJacob Faibussowitsch   if (!Bmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
243598921bdaSJacob Faibussowitsch   if (!Cmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
2436fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2437fcdce8c4SStefano Zampini   Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
2438fcdce8c4SStefano Zampini   Ccsr = (CsrMatrix*)Cmat->mat;
2439e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2440e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2441e8d2b73aSMark Adams   if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
2442fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2443fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2444fcdce8c4SStefano Zampini   BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
2445b4285af6SJunchao Zhang   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2446b4285af6SJunchao Zhang   #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2447b4285af6SJunchao Zhang     stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2448b4285af6SJunchao Zhang                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2449b4285af6SJunchao Zhang                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2450b4285af6SJunchao Zhang                                mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2451b4285af6SJunchao Zhang   #else
2452b4285af6SJunchao Zhang     stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2453fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2454fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2455fcdce8c4SStefano Zampini                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2456b4285af6SJunchao Zhang     stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2457fcdce8c4SStefano Zampini                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2458fcdce8c4SStefano Zampini                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2459b4285af6SJunchao Zhang   #endif
2460fcdce8c4SStefano Zampini #else
2461b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2462fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2463fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2464fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2465fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2466fcdce8c4SStefano Zampini #endif
2467fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2468fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2469fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2470fcdce8c4SStefano Zampini   C->offloadmask = PETSC_OFFLOAD_GPU;
2471fcdce8c4SStefano Zampini finalize:
2472fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
2473*7d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
2474fcdce8c4SStefano Zampini   ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
2475*7d3de750SJacob Faibussowitsch   ierr = PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
2476fcdce8c4SStefano Zampini   c->reallocs         = 0;
2477fcdce8c4SStefano Zampini   C->info.mallocs    += 0;
2478fcdce8c4SStefano Zampini   C->info.nz_unneeded = 0;
2479fcdce8c4SStefano Zampini   C->assembled = C->was_assembled = PETSC_TRUE;
2480fcdce8c4SStefano Zampini   C->num_ass++;
2481ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2482ccdfe979SStefano Zampini }
2483fcdce8c4SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE - symbolic phase of the sparse-sparse products
  C = A*B, C = A^T*B and C = A*B^T when both A and B are of type MATSEQAIJCUSPARSE.

  Input Parameter:
. C - the product matrix; C->product supplies A, B and the product type, and C->product->data must be empty

  What this routine does, in order:
   1. checks the operand types and the CSR storage format, and copies A and B to the GPU;
   2. selects the operand mult structs; cuSPARSE is always called with non-transposed operations, so
      for AtB/ABt an explicit transpose is formed first (unless the operand is flagged symmetric, in
      which case the product is treated as AB);
   3. creates the GPU CSR container of C (reusing the compressed-row layout of A when A uses one);
   4. calls cuSPARSE to obtain the nonzero pattern of C; all three code paths below also run the
      numerical product at this point;
   5. mirrors the pattern on the host (c->i, c->j, c->ilen, c->imax, diagonal markers) and allocates c->a;
   6. installs MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE as the numeric phase.

  Notes:
  The per-product state (MatMatCusparse) is stored in C->product->data and released through
  MatDestroy_MatMatCusparse.

  If the result has no rows or columns, or A or B has no nonzeros, cuSPARSE is not called at all and an
  empty pattern is produced.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *a,*b,*c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscInt                     i,j,m,n,k;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
  PetscLogDouble               flops;
  PetscBool                    biscompressed,ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t         BmatSpDescr;
#else
  int                          cnz;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ*)A->data;
  b = (Mat_SeqAIJ*)B->data;
  /* product data: attached to C right away so that it is released together with the product */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* a symmetric operand makes the transposed product identical to AB: skip the explicit transpose and
     record the shortcut in the product */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }
  /* select the operand structs and the sizes: C is m x n, k is the contracted dimension.
     The compressed-row flags are only raised for an operand that is used untransposed */
  biscompressed = PETSC_FALSE;
  ciscompressed = PETSC_FALSE;
  switch (ptype) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    k = A->cmap->n;
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    k = A->rmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    if (b->compressedrow.use) biscompressed = PETSC_TRUE;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    k = A->cmap->n;
    ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }

  /* create cusparse matrix */
  ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  c     = (Mat_SeqAIJ*)C->data;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
  Ccsr  = new CsrMatrix;

  c->compressedrow.use = ciscompressed;
  if (c->compressedrow.use) { /* if a is in compressed row, then c will be in compressed row format */
    c->compressedrow.nrows = a->compressedrow.nrows;
    ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
    ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
    Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
    Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
  } else {
    c->compressedrow.nrows  = 0;
    c->compressedrow.i      = NULL;
    c->compressedrow.rindex = NULL;
    Ccusp->workVector       = NULL;
    Cmat->cprowIndices      = NULL;
  }
  Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
  Ccusp->mat      = Cmat;
  Ccusp->mat->mat = Ccsr;
  Ccsr->num_rows    = Ccusp->nrows;
  Ccsr->num_cols    = n;
  Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
  stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
  /* device-resident scalar constants owned by the mult struct of C */
  cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
    /* empty product: build an all-zero pattern by hand and skip every cuSPARSE call */
    thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
    c->nz = 0;
    Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
    Ccsr->values = new THRUSTARRAY(c->nz);
    goto finalizesym;
  }

  if (!Amat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  if (!biscompressed) {
    Bcsr = (CsrMatrix*)Bmat->mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    BmatSpDescr = Bmat->matDescr;
#endif
  } else { /* we need to use row offsets for the full matrix */
    /* view of B that shares its column indices and values but uses the uncompressed row offsets;
       it is stored in mmdata */
    CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
    Bcsr = new CsrMatrix;
    Bcsr->num_rows       = B->rmap->n;
    Bcsr->num_cols       = cBcsr->num_cols;
    Bcsr->num_entries    = cBcsr->num_entries;
    Bcsr->column_indices = cBcsr->column_indices;
    Bcsr->values         = cBcsr->values;
    if (!Bcusp->rowoffsets_gpu) {
      Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
      Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
      ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
    mmdata->Bcsr = Bcsr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    if (Bcsr->num_rows && Bcsr->num_cols) {
      stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
                               Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                               Bcsr->values->data().get(),
                               CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                               CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    BmatSpDescr = mmdata->matSpBDescr;
#endif
  }
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  /* precompute flops count (from the host CSR arrays; stored in mmdata) */
  if (ptype == MATPRODUCT_AB) {
    /* each nonzero A(i,brow) is combined with every nonzero of row brow of B: one multiply and one add */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt st = a->i[i];
      const PetscInt en = a->i[i+1];
      for (j=st; j<en; j++) {
        const PetscInt brow = a->j[j];
        flops += 2.*(b->i[brow+1] - b->i[brow]);
      }
    }
  } else if (ptype == MATPRODUCT_AtB) {
    /* row i of A and row i of B contribute an outer product */
    for (i=0, flops = 0; i<A->rmap->n; i++) {
      const PetscInt anzi = a->i[i+1] - a->i[i];
      const PetscInt bnzi = b->i[i+1] - b->i[i];
      flops += (2.*anzi)*bnzi;
    }
  } else { /* TODO */
    flops = 0.;
  }

  mmdata->flops = flops;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* C starts with zero nonzeros and no arrays; the real pointers are attached once nnz(C) is known */
  stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
                          NULL, NULL, NULL,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
 {
  /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
     We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
  */
  void*  dBuffer1 = NULL;
  void*  dBuffer2 = NULL;
  void*  dBuffer3 = NULL;
  /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
  size_t bufferSize1 = 0;
  size_t bufferSize2 = 0;
  size_t bufferSize3 = 0;
  size_t bufferSize4 = 0;
  size_t bufferSize5 = 0;

  /*----------------------------------------------------------------------*/
  /* ask bufferSize1 bytes for external memory */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                            CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                            &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  /* same two-call pattern: the first call (NULL buffers) only returns the sizes */
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                 CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                 &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
  cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);

  /*----------------------------------------------------------------------*/
  /* get matrix C non-zero entries C_nnz1 */
  stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* allocate matrix C */
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  /* update matC with the new pointers */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);

  /*----------------------------------------------------------------------*/
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
                                  &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
  cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                     Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                     cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                     mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  /* the sizes are size_t: cast them to match the %ld conversions (long and size_t differ in width on LLP64 platforms) */
  ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,(long)(bufferSize4/1024),(long)(bufferSize5/1024));CHKERRQ(ierr);
 }
 #else
  size_t bufSize2;
  /* ask bufferSize bytes for external memory */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
  /* inspect the matrices A and B to understand the memory requirement for the next step */
  stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
  /* ask bufferSize again bytes for external memory */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
  /* The CUSPARSE documentation is not clear, nor the API
     We need both buffers to perform the operations properly!
     mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
     it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
     is stored in the descriptor! What a messy API... */
  cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
  /* compute the intermediate product of A * B */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  /* get matrix C non-zero entries C_nnz1 */
  stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
  c->nz = (PetscInt) C_nnz1;
  /* the sizes are size_t: cast them to match the %ld conversions (long and size_t differ in width on LLP64 platforms) */
  ierr = PetscInfo(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,(long)(bufSize2/1024),(long)(mmdata->mmBufferSize/1024));CHKERRQ(ierr);
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
                                Ccsr->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                             Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                             cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
#else
  /* the nnz count is returned through the host variable cnz, hence host pointer mode for this call */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
  stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
  c->nz = cnz;
  Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
  Ccsr->values = new THRUSTARRAY(c->nz);
  CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */

  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
     I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
     D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
finalizesym:
  /* mirror the GPU pattern in the host Mat_SeqAIJ arrays; i, j and a are allocated separately */
  c->singlemalloc = PETSC_FALSE;
  c->free_a       = PETSC_TRUE;
  c->free_ij      = PETSC_TRUE;
  ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
  /* when C is in compressed-row format the device offsets describe only the stored rows, so they are
     copied into c->compressedrow.i and expanded into c->i further down */
  if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
    PetscInt *d_i = c->i;
    THRUSTINTARRAY ii(Ccsr->row_offsets->size());
    THRUSTINTARRAY jj(Ccsr->column_indices->size());
    ii   = *Ccsr->row_offsets;
    jj   = *Ccsr->column_indices;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  } else {
    PetscInt *d_i = c->i;
    if (ciscompressed) d_i = c->compressedrow.i;
    cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  if (ciscompressed) { /* need to expand host row offsets */
    /* every row up to and including stored row rindex[k] gets the start offset of that stored row as
       its end-exclusive bound; rows after the last stored row get the final offset */
    PetscInt r = 0;
    c->i[0] = 0;
    for (k = 0; k < c->compressedrow.nrows; k++) {
      const PetscInt next = c->compressedrow.rindex[k];
      const PetscInt old = c->compressedrow.i[k];
      for (; r < next; r++) c->i[r+1] = old;
    }
    for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
  }
  ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
  ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
  c->maxnz = c->nz;
  c->nonzerorowcnt = 0;
  c->rmax = 0;
  /* per-row statistics: row lengths, number of nonempty rows, longest row */
  for (k = 0; k < m; k++) {
    const PetscInt nn = c->i[k+1] - c->i[k];
    c->ilen[k] = c->imax[k] = nn;
    c->nonzerorowcnt += (PetscInt)!!nn;
    c->rmax = PetscMax(c->rmax,nn);
  }
  ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
  ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
  Ccsr->num_entries = c->nz;

  C->nonzerostate++;
  ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
  Ccusp->nonzerostate = C->nonzerostate;
  C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
  C->preallocated  = PETSC_TRUE;
  C->assembled     = PETSC_FALSE;
  C->was_assembled = PETSC_FALSE;
  if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
    mmdata->reusesym = PETSC_TRUE;
    C->offloadmask   = PETSC_OFFLOAD_GPU;
  }
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
2882fcdce8c4SStefano Zampini 
2883fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2884fcdce8c4SStefano Zampini 
/*
   MatProductSetFromOptions_SeqAIJCUSPARSE - chooses the symbolic-phase routine of a matrix product.

   Handles both a sparse and a dense B operand:
   - dense B: AB, AtB, ABt, PtAP and RARt go to the SeqAIJCUSPARSE*SeqDENSECUDA symbolic routine
     (or to the SeqAIJ*SeqDense setup when A is bound to the CPU); ABC uses the basic ABC routine
   - B (and, for ABC, C) of type MATSEQAIJCUSPARSE with no operand bound to the CPU: AB, AtB and ABt go to
     the SeqAIJCUSPARSE*SeqAIJCUSPARSE symbolic routine; PtAP, RARt and ABC use the basic ABC routine
   - anything else: falls back to the SeqAIJ setup

   In the sparse GPU case the user may request the CPU code path through a "..._backend_cpu" option;
   no such option is queried for ABt.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B only counts as a GPU operand when neither A nor B is bound to the CPU */
  if (!product->A->boundtocpu && !product->B->boundtocpu) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* C is an operand only for ABC; for every other product type Ciscusp stays PETSC_TRUE */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    const PetscBool api     = product->api_user;
    const char      *title  = NULL,*option = NULL,*manpage = NULL;
    PetscBool       usecpu  = PETSC_FALSE;

    /* pick the option-block title, the option name and the manual page for this product type */
    switch (product->type) {
    case MATPRODUCT_AB:
      manpage = "MatMatMult";
      title   = api ? "MatMatMult" : "MatProduct_AB";
      option  = api ? "-matmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      manpage = "MatTransposeMatMult";
      title   = api ? "MatTransposeMatMult" : "MatProduct_AtB";
      option  = api ? "-mattransposematmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      manpage = "MatPtAP";
      title   = api ? "MatPtAP" : "MatProduct_PtAP";
      option  = api ? "-matptap_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      manpage = "MatRARt";
      title   = api ? "MatRARt" : "MatProduct_RARt";
      option  = api ? "-matrart_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      manpage = "MatMatMatMult";
      title   = api ? "MatMatMatMult" : "MatProduct_ABC";
      option  = api ? "-matmatmatmult_backend_cpu" : "-mat_product_algorithm_backend_cpu";
      break;
    default: /* no CPU-backend option for the remaining product types */
      break;
    }
    if (option) {
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");CHKERRQ(ierr);
      ierr = PetscOptionsBool(option,"Use CPU code",manpage,usecpu,&usecpu,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    /* treating the operands as non-GPU routes the dispatch below to the AIJ fallback */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    const MatProductType ptype = product->type;

    if (ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
      if (product->A->boundtocpu) {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      } else {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      }
    }
  } else if (Biscusp && Ciscusp) {
    const MatProductType ptype = product->type;

    if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt || ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3007ccdfe979SStefano Zampini 
30086fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
30099ae82921SPaul Mullowney {
3010b175d8bbSPaul Mullowney   PetscErrorCode ierr;
30119ae82921SPaul Mullowney 
30129ae82921SPaul Mullowney   PetscFunctionBegin;
3013e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
3014e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
3015e6e9a74fSStefano Zampini }
3016e6e9a74fSStefano Zampini 
/* MatMultAdd_SeqAIJCUSPARSE - zz = A xx + yy: forwards to the shared multiply-add kernel */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE; /* op(A) = A */
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3025e6e9a74fSStefano Zampini 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - yy = A^H xx: forwards to the shared multiply-add kernel with no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE; /* op(A) = A^H */
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3034e6e9a74fSStefano Zampini 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - zz = A^H xx + yy: forwards to the shared multiply-add kernel */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE; /* op(A) = A^H */
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30439ae82921SPaul Mullowney 
30446fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
3045ca45077fSPaul Mullowney {
3046b175d8bbSPaul Mullowney   PetscErrorCode ierr;
3047ca45077fSPaul Mullowney 
3048ca45077fSPaul Mullowney   PetscFunctionBegin;
3049e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
3050ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3051ca45077fSPaul Mullowney }
3052ca45077fSPaul Mullowney 
/*
   ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one thread per entry.

   Launch: 1D grid of 1D blocks with at least n threads in total; surplus threads do nothing.
   No shared memory is used. The update is not atomic, so the entries of idx must be distinct
   (NOTE(review): the caller in this file passes the compressed-row index list - confirm it has no repeats).

   The thread index is computed in PetscInt so that it cannot wrap around when PetscInt is 64-bit
   and n does not fit in a 32-bit int.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[idx[i]] += x[i];
}
3058a0e72f99SJunchao Zhang 
/*
   MatMultAddKernel_SeqAIJCUSPARSE - common implementation of the MatMult/MatMultAdd (transpose, Hermitian transpose) routines.

   Computes zz = op(A) xx + yy, where
     op(A) = A    if !trans
     op(A) = A^T  if  trans && !herm
     op(A) = A^H  if  trans &&  herm
   The combination herm && !trans is rejected with PETSC_ERR_GPU.

   Input Parameters:
+  A     - the matrix
.  xx    - the vector to be multiplied
.  yy    - the vector to be added, or NULL for a plain product; may be the same vector as zz
.  trans - use the (Hermitian) transpose of A
-  herm  - with trans, use the conjugate transpose

   Output Parameter:
.  zz - the result

   Notes:
   When the matrix has no nonzero rows the routine only sets zz to zero (yy NULL) or copies yy into zz.

   When the GPU matrix uses compressed rows (rows without nonzeros dropped), a work vector holds the short
   side of the product: the result of A xx before it is scattered into zz, or the gathered entries of xx
   before A^T/A^H is applied.

   zz is accessed read-write only when yy == zz; otherwise it is accessed write-only and its previous
   contents must not enter the result.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx,ny;
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* op(A) xx is zero: the result is just yy (or zero) */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose of the stored matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta z
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         A^Tx is of full length and is written directly into zarray. The multiplication accumulates into
         the output array, so beta may be 1.0 only when that array already holds y, i.e. when yy == zz.
         When yy is a different vector, zarray was obtained write-only and its contents are unspecified:
         use beta = 0.0 here and add yy after the multiplication.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = (yy == zz) ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        const PetscInt n = matstruct->cprowIndices->size();
        if (n) { /* a zero-sized grid is an invalid launch configuration */
          cudaError_t cerr;
          ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
          cerr = cudaGetLastError();CHKERRCUDA(cerr); /* a kernel launch reports configuration errors only through the last-error state */
        }
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) { /* zz holds op(A)*xx only (beta was zero), so add the separate vector yy */
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* flop count: 2*nz for a multiply-add, 2*nz minus one per nonzero row for a plain multiply */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
32509ae82921SPaul Mullowney 
32516fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
3252ca45077fSPaul Mullowney {
3253b175d8bbSPaul Mullowney   PetscErrorCode ierr;
32546e111a19SKarl Rupp 
3255ca45077fSPaul Mullowney   PetscFunctionBegin;
3256e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
3257ca45077fSPaul Mullowney   PetscFunctionReturn(0);
3258ca45077fSPaul Mullowney }
3259ca45077fSPaul Mullowney 
32606fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
32619ae82921SPaul Mullowney {
32629ae82921SPaul Mullowney   PetscErrorCode     ierr;
3263042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
3264042217e8SBarry Smith   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
32653fa6b06aSMark Adams 
3266042217e8SBarry Smith   PetscFunctionBegin;
3267042217e8SBarry Smith   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
3268042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
3269042217e8SBarry Smith     cudaError_t cerr;
3270042217e8SBarry Smith 
3271042217e8SBarry Smith     ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
3272042217e8SBarry Smith     cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
3273042217e8SBarry Smith     cusp->deviceMat = NULL;
3274042217e8SBarry Smith   }
32759ae82921SPaul Mullowney   PetscFunctionReturn(0);
32769ae82921SPaul Mullowney }
32779ae82921SPaul Mullowney 
32789ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3279e057df02SPaul Mullowney /*@
32809ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3281e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately be pushed down
3282e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3283e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3284e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3285e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
32869ae82921SPaul Mullowney 
3287d083f849SBarry Smith    Collective
32889ae82921SPaul Mullowney 
32899ae82921SPaul Mullowney    Input Parameters:
32909ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
32919ae82921SPaul Mullowney .  m - number of rows
32929ae82921SPaul Mullowney .  n - number of columns
32939ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
32949ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
32950298fd71SBarry Smith          (possibly different for each row) or NULL
32969ae82921SPaul Mullowney 
32979ae82921SPaul Mullowney    Output Parameter:
32989ae82921SPaul Mullowney .  A - the matrix
32999ae82921SPaul Mullowney 
33009ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
33019ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
33029ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
33039ae82921SPaul Mullowney 
33049ae82921SPaul Mullowney    Notes:
33059ae82921SPaul Mullowney    If nnz is given then nz is ignored
33069ae82921SPaul Mullowney 
33079ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
33089ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
33099ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
33109ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
33119ae82921SPaul Mullowney 
33129ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
33130298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
33149ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
33159ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
33169ae82921SPaul Mullowney 
33179ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
33189ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
33199ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
33209ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
33219ae82921SPaul Mullowney 
33229ae82921SPaul Mullowney    Level: intermediate
33239ae82921SPaul Mullowney 
3324e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
33259ae82921SPaul Mullowney @*/
33269ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
33279ae82921SPaul Mullowney {
33289ae82921SPaul Mullowney   PetscErrorCode ierr;
33299ae82921SPaul Mullowney 
33309ae82921SPaul Mullowney   PetscFunctionBegin;
33319ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
33329ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
33339ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
33349ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
33359ae82921SPaul Mullowney   PetscFunctionReturn(0);
33369ae82921SPaul Mullowney }
33379ae82921SPaul Mullowney 
33386fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
33399ae82921SPaul Mullowney {
33409ae82921SPaul Mullowney   PetscErrorCode ierr;
3341ab25e6cbSDominic Meiser 
33429ae82921SPaul Mullowney   PetscFunctionBegin;
33439ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
3344470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
33459ae82921SPaul Mullowney   } else {
3346470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3347aa372e3fSPaul Mullowney   }
3348c215019aSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3349ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3350365b711fSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetUseCPUSolve_C",NULL);CHKERRQ(ierr);
3351ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3352ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3353fcdce8c4SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3354ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
33557e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
33567e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3357ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_seqaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
33589ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
33599ae82921SPaul Mullowney   PetscFunctionReturn(0);
33609ae82921SPaul Mullowney }
33619ae82921SPaul Mullowney 
3362ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
336395639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
33649ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
33659ff858a8SKarl Rupp {
33669ff858a8SKarl Rupp   PetscErrorCode ierr;
33679ff858a8SKarl Rupp 
33689ff858a8SKarl Rupp   PetscFunctionBegin;
33699ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3370ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
33719ff858a8SKarl Rupp   PetscFunctionReturn(0);
33729ff858a8SKarl Rupp }
33739ff858a8SKarl Rupp 
3374039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
337595639643SRichard Tran Mills {
3376e6e9a74fSStefano Zampini   PetscErrorCode     ierr;
3377a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3378039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3379039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3380039c6fbaSStefano Zampini   PetscScalar        *ay;
3381039c6fbaSStefano Zampini   const PetscScalar  *ax;
3382039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
3383e6e9a74fSStefano Zampini 
338495639643SRichard Tran Mills   PetscFunctionBegin;
3385a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3386a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3387039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
3388a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3389a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3390a587d139SMark     PetscFunctionReturn(0);
339195639643SRichard Tran Mills   }
3392039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
3393a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3394a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3395e8d2b73aSMark Adams   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3396e8d2b73aSMark Adams   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3397039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
3398039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
3399039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3400039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3401039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3402039c6fbaSStefano Zampini     if (eq) {
3403039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3404039c6fbaSStefano Zampini     }
3405039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3406039c6fbaSStefano Zampini   }
3407d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3408d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3409039c6fbaSStefano Zampini 
3410039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3411039c6fbaSStefano Zampini     cusparseStatus_t stat;
3412039c6fbaSStefano Zampini     PetscScalar      b = 1.0;
3413039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3414039c6fbaSStefano Zampini     size_t           bufferSize;
3415039c6fbaSStefano Zampini     void             *buffer;
3416ee7b52eaSHong Zhang     cudaError_t      cerr;
3417039c6fbaSStefano Zampini #endif
3418039c6fbaSStefano Zampini 
3419039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3420039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3421039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3422039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3423039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3424039c6fbaSStefano Zampini                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3425039c6fbaSStefano Zampini                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3426039c6fbaSStefano Zampini                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3427039c6fbaSStefano Zampini     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3428039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3429039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3430039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3431039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3432039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3433039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3434039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3435039c6fbaSStefano Zampini     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3436039c6fbaSStefano Zampini #else
3437039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3438039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3439039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3440039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3441039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3442039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3443039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3444039c6fbaSStefano Zampini #endif
3445039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3446039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3447039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3448039c6fbaSStefano Zampini     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3449039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3450a587d139SMark     cublasHandle_t cublasv2handle;
3451039c6fbaSStefano Zampini     cublasStatus_t berr;
3452a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3453039c6fbaSStefano Zampini 
3454039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3455039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3456a587d139SMark     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3457a587d139SMark     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3458a587d139SMark     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3459039c6fbaSStefano Zampini     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3460a587d139SMark     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3461a587d139SMark     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3462039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3463039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3464a587d139SMark     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3465039c6fbaSStefano Zampini   } else {
3466a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3467d2be01edSStefano Zampini     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3468a587d139SMark   }
346995639643SRichard Tran Mills   PetscFunctionReturn(0);
347095639643SRichard Tran Mills }
347195639643SRichard Tran Mills 
347233c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
347333c9ba73SStefano Zampini {
347433c9ba73SStefano Zampini   PetscErrorCode ierr;
347533c9ba73SStefano Zampini   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
347633c9ba73SStefano Zampini   PetscScalar    *ay;
347733c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
347833c9ba73SStefano Zampini   cublasStatus_t berr;
347933c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
348033c9ba73SStefano Zampini 
348133c9ba73SStefano Zampini   PetscFunctionBegin;
348233c9ba73SStefano Zampini   ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
348333c9ba73SStefano Zampini   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
348433c9ba73SStefano Zampini   ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
348533c9ba73SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
348633c9ba73SStefano Zampini   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
348733c9ba73SStefano Zampini   ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
348833c9ba73SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
348933c9ba73SStefano Zampini   ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
349033c9ba73SStefano Zampini   ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
349133c9ba73SStefano Zampini   PetscFunctionReturn(0);
349233c9ba73SStefano Zampini }
349333c9ba73SStefano Zampini 
34943fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
34953fa6b06aSMark Adams {
34963fa6b06aSMark Adams   PetscErrorCode ierr;
34977e8381f9SStefano Zampini   PetscBool      both = PETSC_FALSE;
3498a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
34997e8381f9SStefano Zampini 
35003fa6b06aSMark Adams   PetscFunctionBegin;
35013fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
35023fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
35037e8381f9SStefano Zampini     if (spptr->mat) {
35047e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
35057e8381f9SStefano Zampini       if (matrix->values) {
35067e8381f9SStefano Zampini         both = PETSC_TRUE;
35077e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
35087e8381f9SStefano Zampini       }
35097e8381f9SStefano Zampini     }
35107e8381f9SStefano Zampini     if (spptr->matTranspose) {
35117e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
35127e8381f9SStefano Zampini       if (matrix->values) {
35137e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
35147e8381f9SStefano Zampini       }
35157e8381f9SStefano Zampini     }
35163fa6b06aSMark Adams   }
3517a587d139SMark   //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
3518a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3519a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
35207e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3521a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
35223fa6b06aSMark Adams   PetscFunctionReturn(0);
35233fa6b06aSMark Adams }
35243fa6b06aSMark Adams 
3525a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3526a587d139SMark {
3527a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3528a587d139SMark   PetscErrorCode ierr;
3529a587d139SMark 
3530a587d139SMark   PetscFunctionBegin;
35319a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
35329a14fc28SStefano Zampini     A->boundtocpu = flg;
35339a14fc28SStefano Zampini     PetscFunctionReturn(0);
35349a14fc28SStefano Zampini   }
3535a587d139SMark   if (flg) {
3536a587d139SMark     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3537a587d139SMark 
353833c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3539a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3540a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3541a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3542a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3543a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3544a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3545a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3546a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3547fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
354867a45760SJunchao Zhang     ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
3549c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3550a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3551a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3552a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3553a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3554a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3555fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3556a587d139SMark   } else {
355733c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3558a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3559a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3560a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3561a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3562a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3563a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3564a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3565a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3566fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
356767a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
356867a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
356967a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
357067a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
357167a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
357267a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3573c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3574a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3575a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3576a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3577a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3578fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3579a587d139SMark   }
3580a587d139SMark   A->boundtocpu = flg;
3581ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
3582ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
3583ea500dcfSRichard Tran Mills   } else {
3584ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
3585ea500dcfSRichard Tran Mills   }
3586a587d139SMark   PetscFunctionReturn(0);
3587a587d139SMark }
3588a587d139SMark 
358949735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
35909ae82921SPaul Mullowney {
35919ae82921SPaul Mullowney   PetscErrorCode   ierr;
3592aa372e3fSPaul Mullowney   cusparseStatus_t stat;
359349735bf3SStefano Zampini   Mat              B;
35949ae82921SPaul Mullowney 
35959ae82921SPaul Mullowney   PetscFunctionBegin;
3596a4af0ceeSJacob Faibussowitsch   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
359749735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
359849735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
359949735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
360049735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
360149735bf3SStefano Zampini   }
360249735bf3SStefano Zampini   B = *newmat;
360349735bf3SStefano Zampini 
360434136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
360534136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
360634136279SStefano Zampini 
360749735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
36089ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3609e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
3610e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3611e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3612a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
36131a2c6b5cSJunchao Zhang       spptr->format     = MAT_CUSPARSE_CSR;
3614d8132acaSStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3615a435da06SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3616a435da06SStefano Zampini       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3617a435da06SStefano Zampini      #else
3618d8132acaSStefano Zampini       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3619a435da06SStefano Zampini      #endif
3620d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3621d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3622d8132acaSStefano Zampini      #endif
36231a2c6b5cSJunchao Zhang       B->spptr = spptr;
36249ae82921SPaul Mullowney     } else {
3625e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3626e6e9a74fSStefano Zampini 
3627e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3628e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3629a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3630e6e9a74fSStefano Zampini       B->spptr = spptr;
36319ae82921SPaul Mullowney     }
3632e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
363349735bf3SStefano Zampini   }
3634693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
36359ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
36361a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
36379ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
363895639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3639693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
36402205254eSKarl Rupp 
3641e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
36429ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3643bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
3644ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
3645ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
3646ae48a8d0SStefano Zampini #endif
3647365b711fSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
36489ae82921SPaul Mullowney   PetscFunctionReturn(0);
36499ae82921SPaul Mullowney }
36509ae82921SPaul Mullowney 
365102fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
365202fe1965SBarry Smith {
365302fe1965SBarry Smith   PetscErrorCode ierr;
365402fe1965SBarry Smith 
365502fe1965SBarry Smith   PetscFunctionBegin;
365602fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
36570ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
365802fe1965SBarry Smith   PetscFunctionReturn(0);
365902fe1965SBarry Smith }
366002fe1965SBarry Smith 
36613ca39a21SBarry Smith /*MC
3662e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3663e057df02SPaul Mullowney 
3664e057df02SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
36652692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
36662692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3667e057df02SPaul Mullowney 
3668e057df02SPaul Mullowney    Options Database Keys:
3669e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3670aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3671a2b725a8SWilliam Gropp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3672365b711fSMark Adams -  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3673e057df02SPaul Mullowney 
3674e057df02SPaul Mullowney   Level: beginner
3675e057df02SPaul Mullowney 
36768468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3677e057df02SPaul Mullowney M*/
36787f756511SDominic Meiser 
3679bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
36800f39cd5aSBarry Smith 
36813ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
368242c9c57cSBarry Smith {
368342c9c57cSBarry Smith   PetscErrorCode ierr;
368442c9c57cSBarry Smith 
368542c9c57cSBarry Smith   PetscFunctionBegin;
3686bddcd29dSMark Adams   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
36873ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
36883ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
36893ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
36903ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
3691bddcd29dSMark Adams 
369242c9c57cSBarry Smith   PetscFunctionReturn(0);
369342c9c57cSBarry Smith }
369429b38603SBarry Smith 
3695470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
36967f756511SDominic Meiser {
3697e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
36987f756511SDominic Meiser   cusparseStatus_t stat;
36997f756511SDominic Meiser 
37007f756511SDominic Meiser   PetscFunctionBegin;
37017f756511SDominic Meiser   if (*cusparsestruct) {
3702e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3703e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
37047f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
370581902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
37067e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
37077e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
3708a49f1ed0SStefano Zampini     delete (*cusparsestruct)->csr2csc_i;
37097e8381f9SStefano Zampini     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3710e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
37117f756511SDominic Meiser   }
37127f756511SDominic Meiser   PetscFunctionReturn(0);
37137f756511SDominic Meiser }
37147f756511SDominic Meiser 
37157f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
37167f756511SDominic Meiser {
37177f756511SDominic Meiser   PetscFunctionBegin;
37187f756511SDominic Meiser   if (*mat) {
37197f756511SDominic Meiser     delete (*mat)->values;
37207f756511SDominic Meiser     delete (*mat)->column_indices;
37217f756511SDominic Meiser     delete (*mat)->row_offsets;
37227f756511SDominic Meiser     delete *mat;
37237f756511SDominic Meiser     *mat = 0;
37247f756511SDominic Meiser   }
37257f756511SDominic Meiser   PetscFunctionReturn(0);
37267f756511SDominic Meiser }
37277f756511SDominic Meiser 
3728470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
37297f756511SDominic Meiser {
37307f756511SDominic Meiser   cusparseStatus_t stat;
37317f756511SDominic Meiser   PetscErrorCode   ierr;
37327f756511SDominic Meiser 
37337f756511SDominic Meiser   PetscFunctionBegin;
37347f756511SDominic Meiser   if (*trifactor) {
373557d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3736afb2bd1cSJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
37377f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
37381b0a6780SStefano Zampini     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
37392cbc15d9SMark     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3740afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
37411b0a6780SStefano Zampini     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3742afb2bd1cSJunchao Zhang    #endif
3743da79fbbcSStefano Zampini     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
37447f756511SDominic Meiser   }
37457f756511SDominic Meiser   PetscFunctionReturn(0);
37467f756511SDominic Meiser }
37477f756511SDominic Meiser 
3748470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
37497f756511SDominic Meiser {
37507f756511SDominic Meiser   CsrMatrix        *mat;
37517f756511SDominic Meiser   cusparseStatus_t stat;
37527f756511SDominic Meiser   cudaError_t      err;
37537f756511SDominic Meiser 
37547f756511SDominic Meiser   PetscFunctionBegin;
37557f756511SDominic Meiser   if (*matstruct) {
37567f756511SDominic Meiser     if ((*matstruct)->mat) {
37577f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3758afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3759afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3760afb2bd1cSJunchao Zhang        #else
37617f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
376257d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3763afb2bd1cSJunchao Zhang        #endif
37647f756511SDominic Meiser       } else {
37657f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
37667f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
37677f756511SDominic Meiser       }
37687f756511SDominic Meiser     }
376957d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
37707f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
3771afb2bd1cSJunchao Zhang     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
37727656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
37737656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3774afb2bd1cSJunchao Zhang 
3775afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3776afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3777afb2bd1cSJunchao Zhang     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3778afb2bd1cSJunchao Zhang     for (int i=0; i<3; i++) {
3779afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
3780afb2bd1cSJunchao Zhang         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3781afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3782afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3783afb2bd1cSJunchao Zhang       }
3784afb2bd1cSJunchao Zhang     }
3785afb2bd1cSJunchao Zhang    #endif
37867f756511SDominic Meiser     delete *matstruct;
37877e8381f9SStefano Zampini     *matstruct = NULL;
37887f756511SDominic Meiser   }
37897f756511SDominic Meiser   PetscFunctionReturn(0);
37907f756511SDominic Meiser }
37917f756511SDominic Meiser 
3792e8d2b73aSMark Adams PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
37937f756511SDominic Meiser {
3794e6e9a74fSStefano Zampini   PetscErrorCode ierr;
3795e6e9a74fSStefano Zampini 
37967f756511SDominic Meiser   PetscFunctionBegin;
37977f756511SDominic Meiser   if (*trifactors) {
3798e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3799e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3800e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3801e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
38027f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
38037f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
38047f756511SDominic Meiser     delete (*trifactors)->workVector;
38057e8381f9SStefano Zampini     (*trifactors)->rpermIndices = NULL;
38067e8381f9SStefano Zampini     (*trifactors)->cpermIndices = NULL;
38077e8381f9SStefano Zampini     (*trifactors)->workVector = NULL;
3808bddcd29dSMark Adams     if ((*trifactors)->a_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);}
3809bddcd29dSMark Adams     if ((*trifactors)->i_band_d)   {cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);}
3810e8d2b73aSMark Adams     (*trifactors)->init_dev_prop = PETSC_FALSE;
3811ccdfe979SStefano Zampini   }
3812ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3813ccdfe979SStefano Zampini }
3814ccdfe979SStefano Zampini 
3815ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3816ccdfe979SStefano Zampini {
3817e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
3818ccdfe979SStefano Zampini   cusparseHandle_t handle;
3819ccdfe979SStefano Zampini   cusparseStatus_t stat;
3820ccdfe979SStefano Zampini 
3821ccdfe979SStefano Zampini   PetscFunctionBegin;
3822ccdfe979SStefano Zampini   if (*trifactors) {
3823e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
38247f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
382557d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
38267f756511SDominic Meiser     }
3827e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
38287f756511SDominic Meiser   }
38297f756511SDominic Meiser   PetscFunctionReturn(0);
38307f756511SDominic Meiser }
38317e8381f9SStefano Zampini 
38327e8381f9SStefano Zampini struct IJCompare
38337e8381f9SStefano Zampini {
38347e8381f9SStefano Zampini   __host__ __device__
38357e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
38367e8381f9SStefano Zampini   {
38377e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
38387e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
38397e8381f9SStefano Zampini     return false;
38407e8381f9SStefano Zampini   }
38417e8381f9SStefano Zampini };
38427e8381f9SStefano Zampini 
38437e8381f9SStefano Zampini struct IJEqual
38447e8381f9SStefano Zampini {
38457e8381f9SStefano Zampini   __host__ __device__
38467e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
38477e8381f9SStefano Zampini   {
38487e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
38497e8381f9SStefano Zampini     return true;
38507e8381f9SStefano Zampini   }
38517e8381f9SStefano Zampini };
38527e8381f9SStefano Zampini 
38537e8381f9SStefano Zampini struct IJDiff
38547e8381f9SStefano Zampini {
38557e8381f9SStefano Zampini   __host__ __device__
38567e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
38577e8381f9SStefano Zampini   {
38587e8381f9SStefano Zampini     return t1 == t2 ? 0 : 1;
38597e8381f9SStefano Zampini   }
38607e8381f9SStefano Zampini };
38617e8381f9SStefano Zampini 
38627e8381f9SStefano Zampini struct IJSum
38637e8381f9SStefano Zampini {
38647e8381f9SStefano Zampini   __host__ __device__
38657e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
38667e8381f9SStefano Zampini   {
38677e8381f9SStefano Zampini     return t1||t2;
38687e8381f9SStefano Zampini   }
38697e8381f9SStefano Zampini };
38707e8381f9SStefano Zampini 
38717e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
3872e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
38737e8381f9SStefano Zampini {
38747e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3875fcdce8c4SStefano Zampini   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3876bfcc3627SStefano Zampini   THRUSTARRAY                           *cooPerm_v = NULL;
387708391a17SStefano Zampini   thrust::device_ptr<const PetscScalar> d_v;
38787e8381f9SStefano Zampini   CsrMatrix                             *matrix;
38797e8381f9SStefano Zampini   PetscErrorCode                        ierr;
38807e8381f9SStefano Zampini   PetscInt                              n;
38817e8381f9SStefano Zampini 
38827e8381f9SStefano Zampini   PetscFunctionBegin;
38837e8381f9SStefano Zampini   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
38847e8381f9SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
38857e8381f9SStefano Zampini   if (!cusp->cooPerm) {
38867e8381f9SStefano Zampini     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
38877e8381f9SStefano Zampini     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
38887e8381f9SStefano Zampini     PetscFunctionReturn(0);
38897e8381f9SStefano Zampini   }
38907e8381f9SStefano Zampini   matrix = (CsrMatrix*)cusp->mat->mat;
38917e8381f9SStefano Zampini   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3892e61fc153SStefano Zampini   if (!v) {
3893e61fc153SStefano Zampini     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3894e61fc153SStefano Zampini     goto finalize;
38957e8381f9SStefano Zampini   }
3896e61fc153SStefano Zampini   n = cusp->cooPerm->size();
389708391a17SStefano Zampini   if (isCudaMem(v)) {
389808391a17SStefano Zampini     d_v = thrust::device_pointer_cast(v);
389908391a17SStefano Zampini   } else {
3900e61fc153SStefano Zampini     cooPerm_v = new THRUSTARRAY(n);
3901e61fc153SStefano Zampini     cooPerm_v->assign(v,v+n);
390208391a17SStefano Zampini     d_v = cooPerm_v->data();
3903e61fc153SStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
390408391a17SStefano Zampini   }
3905bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3906e61fc153SStefano Zampini   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
3907ddea5d60SJunchao Zhang     if (cusp->cooPerm_a) { /* there are repeated entries in d_v[], and we need to add these them */
3908bfcc3627SStefano Zampini       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
390908391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3910ddea5d60SJunchao Zhang       /* thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
3911ddea5d60SJunchao Zhang         cooPerm_a = [0,0,1,2,3,4]. The length is n, number of nonozeros in d_v[].
3912ddea5d60SJunchao Zhang         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
3913ddea5d60SJunchao Zhang       */
3914e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3915e61fc153SStefano Zampini       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3916e61fc153SStefano Zampini       delete cooPerm_w;
39177e8381f9SStefano Zampini     } else {
3918ddea5d60SJunchao Zhang       /* all nonzeros in d_v[] are unique entries */
391908391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
39207e8381f9SStefano Zampini                                                                 matrix->values->begin()));
392108391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
39227e8381f9SStefano Zampini                                                                 matrix->values->end()));
3923ddea5d60SJunchao Zhang       thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]]  */
39247e8381f9SStefano Zampini     }
39257e8381f9SStefano Zampini   } else {
3926e61fc153SStefano Zampini     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
392708391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3928e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
39297e8381f9SStefano Zampini     } else {
393008391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
39317e8381f9SStefano Zampini                                                                 matrix->values->begin()));
393208391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
39337e8381f9SStefano Zampini                                                                 matrix->values->end()));
39347e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
39357e8381f9SStefano Zampini     }
39367e8381f9SStefano Zampini   }
3937bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3938e61fc153SStefano Zampini finalize:
3939e61fc153SStefano Zampini   delete cooPerm_v;
39407e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3941e61fc153SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3942fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
3943*7d3de750SJacob Faibussowitsch   ierr = PetscInfo(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3944fcdce8c4SStefano Zampini   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3945*7d3de750SJacob Faibussowitsch   ierr = PetscInfo(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",a->rmax);CHKERRQ(ierr);
3946fcdce8c4SStefano Zampini   a->reallocs         = 0;
3947fcdce8c4SStefano Zampini   A->info.mallocs    += 0;
3948fcdce8c4SStefano Zampini   A->info.nz_unneeded = 0;
3949fcdce8c4SStefano Zampini   A->assembled = A->was_assembled = PETSC_TRUE;
3950fcdce8c4SStefano Zampini   A->num_ass++;
39517e8381f9SStefano Zampini   PetscFunctionReturn(0);
39527e8381f9SStefano Zampini }
39537e8381f9SStefano Zampini 
3954a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3955a49f1ed0SStefano Zampini {
3956a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3957a49f1ed0SStefano Zampini   PetscErrorCode     ierr;
3958a49f1ed0SStefano Zampini 
3959a49f1ed0SStefano Zampini   PetscFunctionBegin;
3960a49f1ed0SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3961a49f1ed0SStefano Zampini   if (!cusp) PetscFunctionReturn(0);
3962a49f1ed0SStefano Zampini   if (destroy) {
3963a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
3964a49f1ed0SStefano Zampini     delete cusp->csr2csc_i;
3965a49f1ed0SStefano Zampini     cusp->csr2csc_i = NULL;
3966a49f1ed0SStefano Zampini   }
39671a2c6b5cSJunchao Zhang   A->transupdated = PETSC_FALSE;
3968a49f1ed0SStefano Zampini   PetscFunctionReturn(0);
3969a49f1ed0SStefano Zampini }
3970a49f1ed0SStefano Zampini 
39717e8381f9SStefano Zampini #include <thrust/binary_search.h>
397282a78a4eSJed Brown PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
39737e8381f9SStefano Zampini {
39747e8381f9SStefano Zampini   PetscErrorCode     ierr;
39757e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
39767e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
39777e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
39787e8381f9SStefano Zampini   cudaError_t        cerr;
39797e8381f9SStefano Zampini 
39807e8381f9SStefano Zampini   PetscFunctionBegin;
39817e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
39827e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
39837e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
39847e8381f9SStefano Zampini   if (n != cooPerm_n) {
39857e8381f9SStefano Zampini     delete cusp->cooPerm;
39867e8381f9SStefano Zampini     delete cusp->cooPerm_a;
39877e8381f9SStefano Zampini     cusp->cooPerm = NULL;
39887e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
39897e8381f9SStefano Zampini   }
39907e8381f9SStefano Zampini   if (n) {
39917e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
39927e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
39937e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
39947e8381f9SStefano Zampini 
39957e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
39967e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
39977e8381f9SStefano Zampini 
39987e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
39997e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
40007e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
4001ddea5d60SJunchao Zhang 
4002ddea5d60SJunchao Zhang     /* Ex.
4003ddea5d60SJunchao Zhang       n = 6
4004ddea5d60SJunchao Zhang       coo_i = [3,3,1,4,1,4]
4005ddea5d60SJunchao Zhang       coo_j = [3,2,2,5,2,6]
4006ddea5d60SJunchao Zhang     */
40077e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
40087e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
40097e8381f9SStefano Zampini 
401008391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
40117e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
4012ddea5d60SJunchao Zhang     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
4013ddea5d60SJunchao Zhang     *cusp->cooPerm_a = d_i; /* copy the sorted array */
40147e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
40157e8381f9SStefano Zampini 
4016ddea5d60SJunchao Zhang     /*
4017ddea5d60SJunchao Zhang       d_i     = [1,1,3,3,4,4]
4018ddea5d60SJunchao Zhang       d_j     = [2,2,2,3,5,6]
4019ddea5d60SJunchao Zhang       cooPerm = [2,4,1,0,3,5]
4020ddea5d60SJunchao Zhang     */
4021ddea5d60SJunchao Zhang     auto nekey = thrust::unique(fkey, ekey, IJEqual()); /* unique (d_i, d_j) */
4022ddea5d60SJunchao Zhang 
4023ddea5d60SJunchao Zhang     /*
4024ddea5d60SJunchao Zhang       d_i     = [1,3,3,4,4,x]
4025ddea5d60SJunchao Zhang                             ^ekey
4026ddea5d60SJunchao Zhang       d_j     = [2,2,3,5,6,x]
4027ddea5d60SJunchao Zhang                            ^nekye
4028ddea5d60SJunchao Zhang     */
40297e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
40307e8381f9SStefano Zampini       delete cusp->cooPerm_a;
40317e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
4032ddea5d60SJunchao Zhang     } else { /* Stefano: I couldn't come up with a more elegant algorithm */
4033ddea5d60SJunchao Zhang       /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
4034ddea5d60SJunchao Zhang       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
4035ddea5d60SJunchao Zhang       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());                                              /* w:         [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
4036ddea5d60SJunchao Zhang       (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
40377e8381f9SStefano Zampini       w[0] = 0;
4038ddea5d60SJunchao Zhang       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
4039ddea5d60SJunchao Zhang       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
40407e8381f9SStefano Zampini     }
40417e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
4042ddea5d60SJunchao Zhang     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array d_i = [1,3,3,4,4], supposing A->rmap->n = 6. */
4043ddea5d60SJunchao Zhang                         search_begin, search_begin + A->rmap->n,  /* return in ii[] the index of last position in d_i[] where value could be inserted without violating the ordering */
4044ddea5d60SJunchao Zhang                         ii.begin()); /* ii = [0,1,1,3,5,5]. A leading 0 will be added later */
404508391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
40467e8381f9SStefano Zampini 
40477e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
40487e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
40497e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
40507e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
40517e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
4052ddea5d60SJunchao Zhang     a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
40537e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
40547e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
4055fcdce8c4SStefano Zampini     a->rmax = 0;
40567e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
40577e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
40587e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
40597e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
40607e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
40617e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
40627e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
40637e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
40647e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
4065fcdce8c4SStefano Zampini       a->rmax = PetscMax(a->rmax,nnzr);
40667e8381f9SStefano Zampini     }
4067fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
40687e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
40697e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
4070fcdce8c4SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
40717e8381f9SStefano Zampini   } else {
40727e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
40737e8381f9SStefano Zampini   }
4074e61fc153SStefano Zampini   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
40757e8381f9SStefano Zampini 
40767e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
4077e61fc153SStefano Zampini      The code is so convoluted now that I prefer to copy zeros */
4078e61fc153SStefano Zampini   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
40797e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
40807e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
40817e8381f9SStefano Zampini   A->nonzerostate++;
40827e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4083a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
40847e8381f9SStefano Zampini 
40857e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
40867e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
40877e8381f9SStefano Zampini   PetscFunctionReturn(0);
40887e8381f9SStefano Zampini }
4089ed502f03SStefano Zampini 
40905b7e41feSStefano Zampini /*@C
40915b7e41feSStefano Zampini     MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.
40925b7e41feSStefano Zampini 
40935b7e41feSStefano Zampini    Not collective
40945b7e41feSStefano Zampini 
40955b7e41feSStefano Zampini     Input Parameters:
40965b7e41feSStefano Zampini +   A - the matrix
40975b7e41feSStefano Zampini -   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form
40985b7e41feSStefano Zampini 
40995b7e41feSStefano Zampini     Output Parameters:
41005b7e41feSStefano Zampini +   ia - the CSR row pointers
41015b7e41feSStefano Zampini -   ja - the CSR column indices
41025b7e41feSStefano Zampini 
41035b7e41feSStefano Zampini     Level: developer
41045b7e41feSStefano Zampini 
41055b7e41feSStefano Zampini     Notes:
41065b7e41feSStefano Zampini       When compressed is true, the CSR structure does not contain empty rows
41075b7e41feSStefano Zampini 
41085b7e41feSStefano Zampini .seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
41095b7e41feSStefano Zampini @*/
41105f101d05SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
41115f101d05SStefano Zampini {
41125f101d05SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
41135f101d05SStefano Zampini   CsrMatrix          *csr;
41145f101d05SStefano Zampini   PetscErrorCode     ierr;
41155f101d05SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
41165f101d05SStefano Zampini 
41175f101d05SStefano Zampini   PetscFunctionBegin;
41185f101d05SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
41195f101d05SStefano Zampini   if (!i || !j) PetscFunctionReturn(0);
41205f101d05SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
41215f101d05SStefano Zampini   if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
41225f101d05SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
41235f101d05SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
41245f101d05SStefano Zampini   csr = (CsrMatrix*)cusp->mat->mat;
41255f101d05SStefano Zampini   if (i) {
41265f101d05SStefano Zampini     if (!compressed && a->compressedrow.use) { /* need full row offset */
41275f101d05SStefano Zampini       if (!cusp->rowoffsets_gpu) {
41285f101d05SStefano Zampini         cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
41295f101d05SStefano Zampini         cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
41305f101d05SStefano Zampini         ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
41315f101d05SStefano Zampini       }
41325f101d05SStefano Zampini       *i = cusp->rowoffsets_gpu->data().get();
41335f101d05SStefano Zampini     } else *i = csr->row_offsets->data().get();
41345f101d05SStefano Zampini   }
41355f101d05SStefano Zampini   if (j) *j = csr->column_indices->data().get();
41365f101d05SStefano Zampini   PetscFunctionReturn(0);
41375f101d05SStefano Zampini }
41385f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
.   compressed - PETSC_TRUE or PETSC_FALSE; this value is not referenced by the routine
.   i - the CSR row pointers (may be NULL)
-   j - the CSR column indices (may be NULL)

    Level: developer

    Notes: no device memory is released by this call; each non-NULL argument among i and j is simply reset to NULL on return

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* both index handles are optional: clear only those the caller supplied */
  if (j) {
    *j = NULL;
  }
  if (i) {
    *i = NULL;
  }
  PetscFunctionReturn(0);
}
41655f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host; matrices stored in the MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB formats are rejected with PETSC_ERR_SUP

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the values are handed out as a CsrMatrix array, so only the CSR format can be served */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before exposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the intermediate pointer as well: it is dereferenced on the next line */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4201ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data; set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read-only access: the matrix is left untouched, only the caller's handle is cleared */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4226ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host; matrices stored in the MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB formats are rejected with PETSC_ERR_SUP

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the values are handed out as a CsrMatrix array, so only the CSR format can be served */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* bring the device copy up to date before exposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the intermediate pointer as well: it is dereferenced on the next line */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller receives a writable pointer, so the offload mask is switched to the GPU side */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* NOTE(review): presumably PETSC_FALSE keeps the transpose structure and only marks it stale - confirm against MatSeqAIJCUSPARSEInvalidateTranspose() */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data; set to NULL on return

   Level: developer

   Notes: increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscObject    obj;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller had write access, so bump the state counter of the matrix object */
  obj  = (PetscObject)A;
  ierr = PetscObjectStateIncrease(obj);CHKERRQ(ierr);
  /* finally clear the caller's handle */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4291039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU; the device data structure must already exist, otherwise a PETSC_ERR_COR error is generated

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the values are handed out as a CsrMatrix array, so only the CSR format can be served */
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* no copy to the device is issued here, so every level of the device structure must be validated */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the intermediate pointer as well: it is dereferenced on the next line */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller receives a writable pointer, so the offload mask is switched to the GPU side */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  /* NOTE(review): presumably PETSC_FALSE keeps the transpose structure and only marks it stale - confirm against MatSeqAIJCUSPARSEInvalidateTranspose() */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4328ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameters:
+   A - a MATSEQAIJCUSPARSE matrix
-   a - pointer to the device data; set to NULL on return

   Level: developer

   Notes: increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscObject    obj;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* the caller had write access, so bump the state counter of the matrix object */
  obj  = (PetscObject)A;
  ierr = PetscObjectStateIncrease(obj);CHKERRQ(ierr);
  /* finally clear the caller's handle */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
4356ed502f03SStefano Zampini 
/*
   Ordering predicate for 4-entry tuples: compares the first entry, and breaks ties with the second.
   The third and fourth entries never take part in the comparison.
   MatSeqAIJCUSPARSEMergeMats() passes it to thrust::merge() on zipped (COO row, column, value, tag) sequences.

   The entries are read with the free function thrust::get<N>() rather than the member t.get<N>(), since
   the member form is not available when thrust::tuple is an alias of cuda::std::tuple.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
4367ed502f03SStefano Zampini 
/* Unary functor that adds a fixed integer offset, chosen at construction, to its argument */
struct Shift
{
  int _shift; /* offset added by every call */

  Shift(int shift)
  {
    _shift = shift;
  }

  /* returns the argument displaced by the stored offset */
  __host__ __device__
  inline int operator() (const int &c)
  {
    const int shifted = _shift + c;
    return shifted;
  }
};
4379ed502f03SStefano Zampini 
4380ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4381ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4382ed502f03SStefano Zampini {
4383ed502f03SStefano Zampini   PetscErrorCode               ierr;
4384ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4385ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4386ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4387ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4388ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4389ed502f03SStefano Zampini   cusparseStatus_t             stat;
4390ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4391ed502f03SStefano Zampini   cudaError_t                  cerr;
4392ed502f03SStefano Zampini 
4393ed502f03SStefano Zampini   PetscFunctionBegin;
4394ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4395ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4396ed502f03SStefano Zampini   PetscValidPointer(C,4);
4397ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4398ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
439998921bdaSJacob Faibussowitsch   if (A->rmap->n != B->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
4400ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4401ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4402ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4403ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4404ed502f03SStefano Zampini     m     = A->rmap->n;
4405ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
4406ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4407ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4408ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4409ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4410ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4411ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4412ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4413ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4414ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4415ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4416ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4417ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4418ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4419ed502f03SStefano Zampini     Ccusp->nrows    = m;
4420ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4421ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4422ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4423ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4424ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4425ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4426ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4427ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4428ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4429ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4430ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4431ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4432ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4433ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4434ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4435ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4436ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4437ed502f03SStefano Zampini 
4438ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4439ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4440ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4441ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4442ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4443ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4444ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4445ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4446ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4447ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4448ed502f03SStefano Zampini     if (c->nz) {
44492ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44502ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44512ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44522ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44532ed87e7eSStefano Zampini 
4454ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4455ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4456ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4457ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4458ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4459ed502f03SStefano Zampini         }
44602ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44612ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4462ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4463ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4464ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4465ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4466ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4467ed502f03SStefano Zampini         }
44682ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44692ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4470ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
44712ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44722ed87e7eSStefano Zampini                               Aroff->data().get(),
44732ed87e7eSStefano Zampini                               Annz,
44742ed87e7eSStefano Zampini                               m,
44752ed87e7eSStefano Zampini                               Acoo->data().get(),
44762ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4477ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44782ed87e7eSStefano Zampini                               Broff->data().get(),
4479ed502f03SStefano Zampini                               Bnnz,
4480ed502f03SStefano Zampini                               m,
44812ed87e7eSStefano Zampini                               Bcoo->data().get(),
4482ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
44832ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44842ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44852ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44868909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4487ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4488ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44898909a122SStefano Zampini #else
44908909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44918909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44928909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44938909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44948909a122SStefano Zampini #endif
44952ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44962ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44972ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44982ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
44992ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
45002ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4501ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4502ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4503ed502f03SStefano Zampini       thrust::advance(p2,Annz);
45042ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
45058909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
45068909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
45078909a122SStefano Zampini #endif
45082ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
45092ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
45102ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
45112ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
45122ed87e7eSStefano Zampini #else
45132ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
45142ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
45152ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
45162ed87e7eSStefano Zampini #endif
4517ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
45182ed87e7eSStefano Zampini                               Ccoo->data().get(),
4519ed502f03SStefano Zampini                               c->nz,
4520ed502f03SStefano Zampini                               m,
4521ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4522ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4523ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
45242ed87e7eSStefano Zampini       delete wPerm;
45252ed87e7eSStefano Zampini       delete Acoo;
45262ed87e7eSStefano Zampini       delete Bcoo;
45272ed87e7eSStefano Zampini       delete Ccoo;
4528ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4529ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4530ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4531ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4532ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4533ed502f03SStefano Zampini #endif
45341a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45353606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
45363606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
4537ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4538ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4539ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4540ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4541ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4542ed502f03SStefano Zampini 
45431a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45441a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4545a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4546ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4547ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4548ed502f03SStefano Zampini         CcsrT->num_rows = n;
4549ed502f03SStefano Zampini         CcsrT->num_cols = m;
4550ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4551ed502f03SStefano Zampini 
4552ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4553ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4554ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4555ed502f03SStefano Zampini 
4556ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4557ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4558ed502f03SStefano Zampini         if (AT) {
4559ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4560ed502f03SStefano Zampini           thrust::advance(rT,-1);
4561ed502f03SStefano Zampini         }
4562ed502f03SStefano Zampini         if (BT) {
4563ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4564ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4565ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4566ed502f03SStefano Zampini         }
4567ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4568ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4569ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4570ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4571ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4572ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4573ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4574ed502f03SStefano Zampini 
4575ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4576ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4577ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4578ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4579ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4580ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4581ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4582ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4583ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4584ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4585ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4586ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4587ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4588ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4589ed502f03SStefano Zampini #endif
4590ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4591ed502f03SStefano Zampini       }
4592ed502f03SStefano Zampini     }
4593ed502f03SStefano Zampini 
4594ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4595ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4596ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4597ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4598ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4599ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4600ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4601ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4602ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4603ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4604ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4605ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4606ed502f03SStefano Zampini     } else {
4607ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4608ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4609ed502f03SStefano Zampini     }
4610ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4611ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4612ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4613ed502f03SStefano Zampini     c->maxnz = c->nz;
4614ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4615ed502f03SStefano Zampini     c->rmax = 0;
4616ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4617ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4618ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4619ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4620ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4621ed502f03SStefano Zampini     }
4622ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4623ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4624ed502f03SStefano Zampini     (*C)->nonzerostate++;
4625ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4626ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4627ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4628ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4629ed502f03SStefano Zampini   } else {
463098921bdaSJacob Faibussowitsch     if ((*C)->rmap->n != B->rmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4631ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4632ed502f03SStefano Zampini     if (c->nz) {
4633ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4634ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4635ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4636ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4637ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4638ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4639ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4640ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4641ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4642ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4643ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
464498921bdaSJacob Faibussowitsch       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
464598921bdaSJacob Faibussowitsch       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
464698921bdaSJacob Faibussowitsch       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
464798921bdaSJacob Faibussowitsch       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
464898921bdaSJacob Faibussowitsch       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4649ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4650ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4651ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4652ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4653ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4654ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4655ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4656ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4657ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4658ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4659ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4660ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4661ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4662a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
46631a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4664ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4665ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4666ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4667ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4668ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4669ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4670ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4671ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46721a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4673ed502f03SStefano Zampini       }
4674ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4675ed502f03SStefano Zampini     }
4676ed502f03SStefano Zampini   }
4677ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4678ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4679ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4680ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4681ed502f03SStefano Zampini   PetscFunctionReturn(0);
4682ed502f03SStefano Zampini }
4683c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the device-resident value array of A into v.

  Input Parameters:
+ A   - the matrix whose value array is read (obtained through MatSeqAIJCUSPARSEGetArrayRead())
. n   - number of values to copy
- idx - positions in the value array to gather, or NULL to copy the first n values contiguously

  Output Parameter:
. v - destination for the n values; isCudaMem() is used to decide whether it is written
      directly on the device or filled through a device-to-host copy

  Notes:
  When idx is given and n is nonzero, v[i] receives the idx[i]-th stored value. Otherwise the first
  n stored values are copied with a single cudaMemcpy().

  idx is treated as a host array (its upload is logged as a CPU-to-GPU transfer).
  NOTE(review): no bounds checking is done on idx here - confirm callers guarantee valid positions.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Device staging buffer, only sized when v is not device memory.
       Kept as a scoped object so it is released even if an error check below returns early */
    THRUSTARRAY                     w(dmem ? (PetscInt)0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* gather on the device: dv[i] = av[widx[i]] for i in [0,n) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the caller's buffer */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index set (or nothing to gather): contiguous copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travel from the device to the host whenever v is not device memory */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4723