19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16bc3f50f2SPaul Mullowney 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 67a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure); 686fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 716fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 72e6e9a74fSStefano Zampini static PetscErrorCode 
MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 74e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 759ae82921SPaul Mullowney 767f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 78470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 79ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 81470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 827f756511SDominic Meiser 8357181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat); 8457181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 8557181aedSStefano Zampini 867e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]); 877e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode); 887e8381f9SStefano Zampini 89b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 90b06137fdSPaul Mullowney { 91b06137fdSPaul Mullowney cusparseStatus_t stat; 92b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 93b06137fdSPaul Mullowney 94b06137fdSPaul Mullowney PetscFunctionBegin; 95d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 96b06137fdSPaul Mullowney cusparsestruct->stream = stream; 
9757d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 98b06137fdSPaul Mullowney PetscFunctionReturn(0); 99b06137fdSPaul Mullowney } 100b06137fdSPaul Mullowney 101b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 102b06137fdSPaul Mullowney { 103b06137fdSPaul Mullowney cusparseStatus_t stat; 104b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 105b06137fdSPaul Mullowney 106b06137fdSPaul Mullowney PetscFunctionBegin; 107d98d7c49SStefano Zampini if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 1086b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 10916a2e217SAlejandro Lamas Daviña if (cusparsestruct->handle) { 11057d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 11116a2e217SAlejandro Lamas Daviña } 112b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1136b1cf21dSAlejandro Lamas Daviña } 11457d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 115b06137fdSPaul Mullowney PetscFunctionReturn(0); 116b06137fdSPaul Mullowney } 117b06137fdSPaul Mullowney 118b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 119b06137fdSPaul Mullowney { 120b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1217e8381f9SStefano Zampini PetscBool flg; 1227e8381f9SStefano Zampini PetscErrorCode ierr; 123ccdfe979SStefano Zampini 124b06137fdSPaul Mullowney PetscFunctionBegin; 1257e8381f9SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1267e8381f9SStefano Zampini if (!flg || !cusparsestruct) PetscFunctionReturn(0); 127ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 128b06137fdSPaul Mullowney PetscFunctionReturn(0); 129b06137fdSPaul 
Mullowney } 130b06137fdSPaul Mullowney 131ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1329ae82921SPaul Mullowney { 1339ae82921SPaul Mullowney PetscFunctionBegin; 1349ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1359ae82921SPaul Mullowney PetscFunctionReturn(0); 1369ae82921SPaul Mullowney } 1379ae82921SPaul Mullowney 138c708e6cdSJed Brown /*MC 139087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 140087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 141087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 142087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 143087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these 144087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 
145c708e6cdSJed Brown 1469ae82921SPaul Mullowney Level: beginner 147c708e6cdSJed Brown 1483ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 149c708e6cdSJed Brown M*/ 1509ae82921SPaul Mullowney 15142c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1529ae82921SPaul Mullowney { 1539ae82921SPaul Mullowney PetscErrorCode ierr; 154bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1559ae82921SPaul Mullowney 1569ae82921SPaul Mullowney PetscFunctionBegin; 157bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 158bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1592c7c0729SBarry Smith (*B)->factortype = ftype; 1602c7c0729SBarry Smith (*B)->useordering = PETSC_TRUE; 1619ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1622205254eSKarl Rupp 163087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 16433d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1659ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1669ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 167087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 168087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 169087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1709ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types"); 171bc3f50f2SPaul Mullowney 172fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 
1733ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1749ae82921SPaul Mullowney PetscFunctionReturn(0); 1759ae82921SPaul Mullowney } 1769ae82921SPaul Mullowney 177bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 178ca45077fSPaul Mullowney { 179aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1806e111a19SKarl Rupp 181ca45077fSPaul Mullowney PetscFunctionBegin; 182ca45077fSPaul Mullowney switch (op) { 183e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 184aa372e3fSPaul Mullowney cusparsestruct->format = format; 185ca45077fSPaul Mullowney break; 186e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 187aa372e3fSPaul Mullowney cusparsestruct->format = format; 188ca45077fSPaul Mullowney break; 189ca45077fSPaul Mullowney default: 19036d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 191ca45077fSPaul Mullowney } 192ca45077fSPaul Mullowney PetscFunctionReturn(0); 193ca45077fSPaul Mullowney } 1949ae82921SPaul Mullowney 195e057df02SPaul Mullowney /*@ 196e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 197e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 198aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 199e057df02SPaul Mullowney Not Collective 200e057df02SPaul Mullowney 201e057df02SPaul Mullowney Input Parameters: 2028468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 20336d62e41SPaul Mullowney . op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. 
MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 2042692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 205e057df02SPaul Mullowney 206e057df02SPaul Mullowney Output Parameter: 207e057df02SPaul Mullowney 208e057df02SPaul Mullowney Level: intermediate 209e057df02SPaul Mullowney 2108468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 211e057df02SPaul Mullowney @*/ 212e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 213e057df02SPaul Mullowney { 214e057df02SPaul Mullowney PetscErrorCode ierr; 2156e111a19SKarl Rupp 216e057df02SPaul Mullowney PetscFunctionBegin; 217e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 218e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 219e057df02SPaul Mullowney PetscFunctionReturn(0); 220e057df02SPaul Mullowney } 221e057df02SPaul Mullowney 222e6e9a74fSStefano Zampini /*@ 223*e589036eSStefano Zampini MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose 224e6e9a74fSStefano Zampini 225e6e9a74fSStefano Zampini Collective on mat 226e6e9a74fSStefano Zampini 227e6e9a74fSStefano Zampini Input Parameters: 228e6e9a74fSStefano Zampini + A - Matrix of type SEQAIJCUSPARSE 229e6e9a74fSStefano Zampini - transgen - the boolean flag 230e6e9a74fSStefano Zampini 231e6e9a74fSStefano Zampini Level: intermediate 232e6e9a74fSStefano Zampini 233*e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose() 234e6e9a74fSStefano Zampini @*/ 235e6e9a74fSStefano Zampini PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 236e6e9a74fSStefano Zampini 
{ 237e6e9a74fSStefano Zampini PetscErrorCode ierr; 238e6e9a74fSStefano Zampini PetscBool flg; 239e6e9a74fSStefano Zampini 240e6e9a74fSStefano Zampini PetscFunctionBegin; 241e6e9a74fSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 242e6e9a74fSStefano Zampini ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 243e6e9a74fSStefano Zampini if (flg) { 244e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 24554da937aSStefano Zampini 246e6e9a74fSStefano Zampini if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 247e6e9a74fSStefano Zampini cusp->transgen = transgen; 24854da937aSStefano Zampini if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 24954da937aSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 25054da937aSStefano Zampini } 251e6e9a74fSStefano Zampini } 252e6e9a74fSStefano Zampini PetscFunctionReturn(0); 253e6e9a74fSStefano Zampini } 254e6e9a74fSStefano Zampini 2554416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2569ae82921SPaul Mullowney { 2579ae82921SPaul Mullowney PetscErrorCode ierr; 258e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2599ae82921SPaul Mullowney PetscBool flg; 260a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2616e111a19SKarl Rupp 2629ae82921SPaul Mullowney PetscFunctionBegin; 263e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2649ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 26554da937aSStefano Zampini PetscBool transgen = cusparsestruct->transgen; 26654da937aSStefano Zampini 26754da937aSStefano Zampini ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit 
transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 268afb2bd1cSJunchao Zhang if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 269afb2bd1cSJunchao Zhang 270e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 271a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 272afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 273afb2bd1cSJunchao Zhang 2744c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 275a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 276afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 277afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 278afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 279afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 280afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 281afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 282afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 
283afb2bd1cSJunchao Zhang 284afb2bd1cSJunchao Zhang cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 285afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 286afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 287afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 288afb2bd1cSJunchao Zhang 289afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 290afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 291afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 292afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 293afb2bd1cSJunchao Zhang #endif 2944c87dfd4SPaul Mullowney } 2950af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 2969ae82921SPaul Mullowney PetscFunctionReturn(0); 2979ae82921SPaul Mullowney } 2989ae82921SPaul Mullowney 2996fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3009ae82921SPaul Mullowney { 301da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3029ae82921SPaul Mullowney PetscErrorCode ierr; 3039ae82921SPaul Mullowney 3049ae82921SPaul Mullowney PetscFunctionBegin; 305da79fbbcSStefano Zampini ierr = 
MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3069ae82921SPaul Mullowney ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3079ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3089ae82921SPaul Mullowney PetscFunctionReturn(0); 3099ae82921SPaul Mullowney } 3109ae82921SPaul Mullowney 3116fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3129ae82921SPaul Mullowney { 313da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 3149ae82921SPaul Mullowney PetscErrorCode ierr; 3159ae82921SPaul Mullowney 3169ae82921SPaul Mullowney PetscFunctionBegin; 317da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 3189ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3199ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3209ae82921SPaul Mullowney PetscFunctionReturn(0); 3219ae82921SPaul Mullowney } 3229ae82921SPaul Mullowney 323087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 324087f3262SPaul Mullowney { 325da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 326087f3262SPaul Mullowney PetscErrorCode ierr; 327087f3262SPaul Mullowney 328087f3262SPaul Mullowney PetscFunctionBegin; 329da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 330087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 331087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 332087f3262SPaul Mullowney PetscFunctionReturn(0); 333087f3262SPaul Mullowney } 334087f3262SPaul Mullowney 335087f3262SPaul 
Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 336087f3262SPaul Mullowney { 337da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr; 338087f3262SPaul Mullowney PetscErrorCode ierr; 339087f3262SPaul Mullowney 340087f3262SPaul Mullowney PetscFunctionBegin; 341da79fbbcSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr); 342087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 343087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 344087f3262SPaul Mullowney PetscFunctionReturn(0); 345087f3262SPaul Mullowney } 346087f3262SPaul Mullowney 347087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3489ae82921SPaul Mullowney { 3499ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3509ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3519ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 352aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3539ae82921SPaul Mullowney cusparseStatus_t stat; 3549ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3559ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3569ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3579ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 358b175d8bbSPaul Mullowney PetscErrorCode ierr; 35957d48284SJunchao Zhang cudaError_t cerr; 3609ae82921SPaul Mullowney 3619ae82921SPaul Mullowney PetscFunctionBegin; 362cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 363c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3649ae82921SPaul Mullowney try { 3659ae82921SPaul 
Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 3669ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 367da79fbbcSStefano Zampini if (!loTriFactor) { 3682cbc15d9SMark PetscScalar *AALo; 3692cbc15d9SMark 3702cbc15d9SMark cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 3719ae82921SPaul Mullowney 3729ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 37357d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 37457d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 3759ae82921SPaul Mullowney 3769ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3779ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3789ae82921SPaul Mullowney AiLo[n] = nzLower; 3799ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3809ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3819ae82921SPaul Mullowney v = aa; 3829ae82921SPaul Mullowney vi = aj; 3839ae82921SPaul Mullowney offset = 1; 3849ae82921SPaul Mullowney rowOffset= 1; 3859ae82921SPaul Mullowney for (i=1; i<n; i++) { 3869ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 387e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3889ae82921SPaul Mullowney AiLo[i] = rowOffset; 3899ae82921SPaul Mullowney rowOffset += nz+1; 3909ae82921SPaul Mullowney 391580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 392580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 3939ae82921SPaul Mullowney 3949ae82921SPaul Mullowney offset += nz; 3959ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3969ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3979ae82921SPaul Mullowney offset += 1; 3989ae82921SPaul Mullowney 3999ae82921SPaul Mullowney v += nz; 4009ae82921SPaul Mullowney vi += nz; 4019ae82921SPaul Mullowney } 4022205254eSKarl Rupp 403aa372e3fSPaul 
Mullowney /* allocate space for the triangular factor information */ 404da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 405da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 406aa372e3fSPaul Mullowney /* Create the matrix description */ 40757d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 40857d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 4091b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 410afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 411afb2bd1cSJunchao Zhang #else 41257d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 413afb2bd1cSJunchao Zhang #endif 41457d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 41557d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 416aa372e3fSPaul Mullowney 417aa372e3fSPaul Mullowney /* set the operation */ 418aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 419aa372e3fSPaul Mullowney 420aa372e3fSPaul Mullowney /* set the matrix */ 421aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 422aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 423aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 424aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 425aa372e3fSPaul Mullowney 426aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 427aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 428aa372e3fSPaul Mullowney 429aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 430aa372e3fSPaul Mullowney 
loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 431aa372e3fSPaul Mullowney 432aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 433aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 434aa372e3fSPaul Mullowney 435afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 436da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 437afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 4381b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 439afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 440afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 441afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 442afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 443afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 444afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 445afb2bd1cSJunchao Zhang #endif 446afb2bd1cSJunchao Zhang 447aa372e3fSPaul Mullowney /* perform the solve analysis */ 448aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 449aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 450aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 451afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 4521b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 453afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 454afb2bd1cSJunchao Zhang #endif 
455afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 456da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 457da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 458aa372e3fSPaul Mullowney 459da79fbbcSStefano Zampini /* assign the pointer */ 460aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 4612cbc15d9SMark loTriFactor->AA_h = AALo; 46257d48284SJunchao Zhang cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr); 46357d48284SJunchao Zhang cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr); 4644863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 465da79fbbcSStefano Zampini } else { /* update values only */ 4662cbc15d9SMark if (!loTriFactor->AA_h) { 4672cbc15d9SMark cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 4682cbc15d9SMark } 469da79fbbcSStefano Zampini /* Fill the lower triangular matrix */ 4702cbc15d9SMark loTriFactor->AA_h[0] = 1.0; 471da79fbbcSStefano Zampini v = aa; 472da79fbbcSStefano Zampini vi = aj; 473da79fbbcSStefano Zampini offset = 1; 474da79fbbcSStefano Zampini for (i=1; i<n; i++) { 475da79fbbcSStefano Zampini nz = ai[i+1] - ai[i]; 4762cbc15d9SMark ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr); 477da79fbbcSStefano Zampini offset += nz; 4782cbc15d9SMark loTriFactor->AA_h[offset] = 1.0; 479da79fbbcSStefano Zampini offset += 1; 480da79fbbcSStefano Zampini v += nz; 481da79fbbcSStefano Zampini } 4822cbc15d9SMark loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower); 483da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr); 484da79fbbcSStefano Zampini } 4859ae82921SPaul Mullowney } catch(char *ex) { 4869ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 4879ae82921SPaul Mullowney } 4889ae82921SPaul Mullowney } 4899ae82921SPaul 
Mullowney PetscFunctionReturn(0); 4909ae82921SPaul Mullowney } 4919ae82921SPaul Mullowney 492087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 4939ae82921SPaul Mullowney { 4949ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4959ae82921SPaul Mullowney PetscInt n = A->rmap->n; 4969ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 497aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 4989ae82921SPaul Mullowney cusparseStatus_t stat; 4999ae82921SPaul Mullowney const PetscInt *aj = a->j,*adiag = a->diag,*vi; 5009ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 5019ae82921SPaul Mullowney PetscInt *AiUp, *AjUp; 5029ae82921SPaul Mullowney PetscInt i,nz, nzUpper, offset; 5039ae82921SPaul Mullowney PetscErrorCode ierr; 50457d48284SJunchao Zhang cudaError_t cerr; 5059ae82921SPaul Mullowney 5069ae82921SPaul Mullowney PetscFunctionBegin; 507cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 508c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 5099ae82921SPaul Mullowney try { 5109ae82921SPaul Mullowney /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 5119ae82921SPaul Mullowney nzUpper = adiag[0]-adiag[n]; 512da79fbbcSStefano Zampini if (!upTriFactor) { 5132cbc15d9SMark PetscScalar *AAUp; 5142cbc15d9SMark 5152cbc15d9SMark cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 5162cbc15d9SMark 5179ae82921SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 51857d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 51957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 5209ae82921SPaul Mullowney 5219ae82921SPaul Mullowney /* Fill the upper triangular matrix */ 5229ae82921SPaul Mullowney AiUp[0]=(PetscInt) 0; 5239ae82921SPaul Mullowney AiUp[n]=nzUpper; 5249ae82921SPaul Mullowney offset = nzUpper; 5259ae82921SPaul Mullowney for (i=n-1; i>=0; i--) { 5269ae82921SPaul Mullowney v = aa + adiag[i+1] + 1; 5279ae82921SPaul Mullowney vi = aj + adiag[i+1] + 1; 5289ae82921SPaul Mullowney 529e057df02SPaul Mullowney /* number of elements NOT on the diagonal */ 5309ae82921SPaul Mullowney nz = adiag[i] - adiag[i+1]-1; 5319ae82921SPaul Mullowney 532e057df02SPaul Mullowney /* decrement the offset */ 5339ae82921SPaul Mullowney offset -= (nz+1); 5349ae82921SPaul Mullowney 535e057df02SPaul Mullowney /* first, set the diagonal elements */ 5369ae82921SPaul Mullowney AjUp[offset] = (PetscInt) i; 53709f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1./v[nz]; 5389ae82921SPaul Mullowney AiUp[i] = AiUp[i+1] - (nz+1); 5399ae82921SPaul Mullowney 540580bdb30SBarry Smith ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr); 541580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr); 5429ae82921SPaul Mullowney } 5432205254eSKarl Rupp 544aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 545da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 546da79fbbcSStefano Zampini upTriFactor->solvePolicy = 
CUSPARSE_SOLVE_POLICY_USE_LEVEL; 5472205254eSKarl Rupp 548aa372e3fSPaul Mullowney /* Create the matrix description */ 54957d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 55057d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 5511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 552afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 553afb2bd1cSJunchao Zhang #else 55457d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 555afb2bd1cSJunchao Zhang #endif 55657d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 55757d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 558aa372e3fSPaul Mullowney 559aa372e3fSPaul Mullowney /* set the operation */ 560aa372e3fSPaul Mullowney upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 561aa372e3fSPaul Mullowney 562aa372e3fSPaul Mullowney /* set the matrix */ 563aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 564aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = n; 565aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = n; 566aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = nzUpper; 567aa372e3fSPaul Mullowney 568aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 569aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1); 570aa372e3fSPaul Mullowney 571aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 572aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper); 573aa372e3fSPaul Mullowney 574aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 
575aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper); 576aa372e3fSPaul Mullowney 577afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 578da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 579afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 5801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 581afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 582afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 583afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 584afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 585afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 586afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 587afb2bd1cSJunchao Zhang #endif 588afb2bd1cSJunchao Zhang 589aa372e3fSPaul Mullowney /* perform the solve analysis */ 590aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 591aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 592aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 593afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 5941b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 595afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 596afb2bd1cSJunchao Zhang #endif 597afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 598da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 599da79fbbcSStefano Zampini ierr = 
PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 600aa372e3fSPaul Mullowney 601da79fbbcSStefano Zampini /* assign the pointer */ 602aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 6032cbc15d9SMark upTriFactor->AA_h = AAUp; 60457d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 60557d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 6064863603aSSatish Balay ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 607da79fbbcSStefano Zampini } else { 6082cbc15d9SMark if (!upTriFactor->AA_h) { 6092cbc15d9SMark cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 6102cbc15d9SMark } 611da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 612da79fbbcSStefano Zampini offset = nzUpper; 613da79fbbcSStefano Zampini for (i=n-1; i>=0; i--) { 614da79fbbcSStefano Zampini v = aa + adiag[i+1] + 1; 615da79fbbcSStefano Zampini 616da79fbbcSStefano Zampini /* number of elements NOT on the diagonal */ 617da79fbbcSStefano Zampini nz = adiag[i] - adiag[i+1]-1; 618da79fbbcSStefano Zampini 619da79fbbcSStefano Zampini /* decrement the offset */ 620da79fbbcSStefano Zampini offset -= (nz+1); 621da79fbbcSStefano Zampini 622da79fbbcSStefano Zampini /* first, set the diagonal elements */ 6232cbc15d9SMark upTriFactor->AA_h[offset] = 1./v[nz]; 6242cbc15d9SMark ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr); 625da79fbbcSStefano Zampini } 6262cbc15d9SMark upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper); 627da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr); 628da79fbbcSStefano Zampini } 6299ae82921SPaul Mullowney } catch(char *ex) { 6309ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 6319ae82921SPaul Mullowney } 6329ae82921SPaul Mullowney } 6339ae82921SPaul Mullowney 
PetscFunctionReturn(0); 6349ae82921SPaul Mullowney } 6359ae82921SPaul Mullowney 636087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 6379ae82921SPaul Mullowney { 6389ae82921SPaul Mullowney PetscErrorCode ierr; 6399ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 6409ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 6419ae82921SPaul Mullowney IS isrow = a->row,iscol = a->icol; 6429ae82921SPaul Mullowney PetscBool row_identity,col_identity; 6439ae82921SPaul Mullowney PetscInt n = A->rmap->n; 6449ae82921SPaul Mullowney 6459ae82921SPaul Mullowney PetscFunctionBegin; 646da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 647087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr); 648087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr); 6492205254eSKarl Rupp 650da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 651aa372e3fSPaul Mullowney cusparseTriFactors->nnz=a->nz; 6529ae82921SPaul Mullowney 653c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 654e057df02SPaul Mullowney /* lower triangular indices */ 6559ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 656da79fbbcSStefano Zampini if (!row_identity && !cusparseTriFactors->rpermIndices) { 657da79fbbcSStefano Zampini const PetscInt *r; 658da79fbbcSStefano Zampini 659da79fbbcSStefano Zampini ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 660aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 661aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(r, r+n); 6629ae82921SPaul Mullowney ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 663da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 
664da79fbbcSStefano Zampini } 6659ae82921SPaul Mullowney 666e057df02SPaul Mullowney /* upper triangular indices */ 6679ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 668da79fbbcSStefano Zampini if (!col_identity && !cusparseTriFactors->cpermIndices) { 669da79fbbcSStefano Zampini const PetscInt *c; 670da79fbbcSStefano Zampini 671da79fbbcSStefano Zampini ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 672aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 673aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices->assign(c, c+n); 6749ae82921SPaul Mullowney ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 675da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr); 676da79fbbcSStefano Zampini } 6779ae82921SPaul Mullowney PetscFunctionReturn(0); 6789ae82921SPaul Mullowney } 6799ae82921SPaul Mullowney 680087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 681087f3262SPaul Mullowney { 682087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 683087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 684aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 685aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 686087f3262SPaul Mullowney cusparseStatus_t stat; 687087f3262SPaul Mullowney PetscErrorCode ierr; 68857d48284SJunchao Zhang cudaError_t cerr; 689087f3262SPaul Mullowney PetscInt *AiUp, *AjUp; 690087f3262SPaul Mullowney PetscScalar *AAUp; 691087f3262SPaul Mullowney PetscScalar *AALo; 692087f3262SPaul Mullowney PetscInt nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j; 693087f3262SPaul Mullowney Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ*)A->data; 694087f3262SPaul Mullowney const PetscInt *ai = b->i,*aj = b->j,*vj; 
695087f3262SPaul Mullowney const MatScalar *aa = b->a,*v; 696087f3262SPaul Mullowney 697087f3262SPaul Mullowney PetscFunctionBegin; 698cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 699c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 700087f3262SPaul Mullowney try { 701da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 702da79fbbcSStefano Zampini cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr); 703da79fbbcSStefano Zampini if (!upTriFactor && !loTriFactor) { 704087f3262SPaul Mullowney /* Allocate Space for the upper triangular matrix */ 70557d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 70657d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr); 707087f3262SPaul Mullowney 708087f3262SPaul Mullowney /* Fill the upper triangular matrix */ 709087f3262SPaul Mullowney AiUp[0]=(PetscInt) 0; 710087f3262SPaul Mullowney AiUp[n]=nzUpper; 711087f3262SPaul Mullowney offset = 0; 712087f3262SPaul Mullowney for (i=0; i<n; i++) { 713087f3262SPaul Mullowney /* set the pointers */ 714087f3262SPaul Mullowney v = aa + ai[i]; 715087f3262SPaul Mullowney vj = aj + ai[i]; 716087f3262SPaul Mullowney nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 717087f3262SPaul Mullowney 718087f3262SPaul Mullowney /* first, set the diagonal elements */ 719087f3262SPaul Mullowney AjUp[offset] = (PetscInt) i; 72009f51544SAlejandro Lamas Daviña AAUp[offset] = (MatScalar)1.0/v[nz]; 721087f3262SPaul Mullowney AiUp[i] = offset; 72209f51544SAlejandro Lamas Daviña AALo[offset] = (MatScalar)1.0/v[nz]; 723087f3262SPaul Mullowney 724087f3262SPaul Mullowney offset+=1; 725087f3262SPaul Mullowney if (nz>0) { 726f22e0265SBarry Smith ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr); 727580bdb30SBarry Smith ierr = PetscArraycpy(&(AAUp[offset]), v, 
nz);CHKERRQ(ierr); 728087f3262SPaul Mullowney for (j=offset; j<offset+nz; j++) { 729087f3262SPaul Mullowney AAUp[j] = -AAUp[j]; 730087f3262SPaul Mullowney AALo[j] = AAUp[j]/v[nz]; 731087f3262SPaul Mullowney } 732087f3262SPaul Mullowney offset+=nz; 733087f3262SPaul Mullowney } 734087f3262SPaul Mullowney } 735087f3262SPaul Mullowney 736aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 737da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactor);CHKERRQ(ierr); 738da79fbbcSStefano Zampini upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 739087f3262SPaul Mullowney 740aa372e3fSPaul Mullowney /* Create the matrix description */ 74157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat); 74257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 7431b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 744afb2bd1cSJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 745afb2bd1cSJunchao Zhang #else 74657d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 747afb2bd1cSJunchao Zhang #endif 74857d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 74957d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 750087f3262SPaul Mullowney 751aa372e3fSPaul Mullowney /* set the matrix */ 752aa372e3fSPaul Mullowney upTriFactor->csrMat = new CsrMatrix; 753aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows = A->rmap->n; 754aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols = A->cmap->n; 755aa372e3fSPaul Mullowney upTriFactor->csrMat->num_entries = a->nz; 756aa372e3fSPaul Mullowney 757aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 
758aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 759aa372e3fSPaul Mullowney 760aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 761aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 762aa372e3fSPaul Mullowney 763aa372e3fSPaul Mullowney upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 764aa372e3fSPaul Mullowney upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 765aa372e3fSPaul Mullowney 766afb2bd1cSJunchao Zhang /* set the operation */ 767afb2bd1cSJunchao Zhang upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 768afb2bd1cSJunchao Zhang 769afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 770da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 771afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat); 7721b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 773afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp, 774afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 775afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 776afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, 777afb2bd1cSJunchao Zhang &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 778afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr); 779afb2bd1cSJunchao Zhang #endif 780afb2bd1cSJunchao Zhang 781aa372e3fSPaul Mullowney /* perform the solve analysis */ 782aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, 783aa372e3fSPaul Mullowney upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, 
784aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 785afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo 7861b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 787afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 788afb2bd1cSJunchao Zhang #endif 789afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 790da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 791da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 792aa372e3fSPaul Mullowney 793da79fbbcSStefano Zampini /* assign the pointer */ 794aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor; 795aa372e3fSPaul Mullowney 796aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 797da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactor);CHKERRQ(ierr); 798da79fbbcSStefano Zampini loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 799aa372e3fSPaul Mullowney 800aa372e3fSPaul Mullowney /* Create the matrix description */ 80157d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 80257d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 8031b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 804afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 805afb2bd1cSJunchao Zhang #else 80657d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 807afb2bd1cSJunchao Zhang #endif 80857d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat); 80957d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, 
CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat); 810aa372e3fSPaul Mullowney 811aa372e3fSPaul Mullowney /* set the operation */ 812aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 813aa372e3fSPaul Mullowney 814aa372e3fSPaul Mullowney /* set the matrix */ 815aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 816aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = A->rmap->n; 817aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = A->cmap->n; 818aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = a->nz; 819aa372e3fSPaul Mullowney 820aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 821aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1); 822aa372e3fSPaul Mullowney 823aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 824aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz); 825aa372e3fSPaul Mullowney 826aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 827aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 828aa372e3fSPaul Mullowney 829afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 830da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 831afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 8321b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 833afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 834afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 835afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 836afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 
837afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 838afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 839afb2bd1cSJunchao Zhang #endif 840afb2bd1cSJunchao Zhang 841aa372e3fSPaul Mullowney /* perform the solve analysis */ 842aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 843aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 844aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 845afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 8461b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 847afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 848afb2bd1cSJunchao Zhang #endif 849afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 850da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 851da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 852aa372e3fSPaul Mullowney 853da79fbbcSStefano Zampini /* assign the pointer */ 854aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor; 855087f3262SPaul Mullowney 856da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr); 85757d48284SJunchao Zhang cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr); 85857d48284SJunchao Zhang cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr); 859da79fbbcSStefano Zampini } else { 860da79fbbcSStefano Zampini /* Fill the upper triangular matrix */ 861da79fbbcSStefano Zampini offset = 0; 862da79fbbcSStefano Zampini for (i=0; i<n; i++) { 863da79fbbcSStefano Zampini /* set the pointers */ 864da79fbbcSStefano Zampini v = aa + ai[i]; 865da79fbbcSStefano Zampini nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 
866da79fbbcSStefano Zampini 867da79fbbcSStefano Zampini /* first, set the diagonal elements */ 868da79fbbcSStefano Zampini AAUp[offset] = 1.0/v[nz]; 869da79fbbcSStefano Zampini AALo[offset] = 1.0/v[nz]; 870da79fbbcSStefano Zampini 871da79fbbcSStefano Zampini offset+=1; 872da79fbbcSStefano Zampini if (nz>0) { 873da79fbbcSStefano Zampini ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr); 874da79fbbcSStefano Zampini for (j=offset; j<offset+nz; j++) { 875da79fbbcSStefano Zampini AAUp[j] = -AAUp[j]; 876da79fbbcSStefano Zampini AALo[j] = AAUp[j]/v[nz]; 877da79fbbcSStefano Zampini } 878da79fbbcSStefano Zampini offset+=nz; 879da79fbbcSStefano Zampini } 880da79fbbcSStefano Zampini } 881da79fbbcSStefano Zampini if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 882da79fbbcSStefano Zampini if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 883da79fbbcSStefano Zampini upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz); 884da79fbbcSStefano Zampini loTriFactor->csrMat->values->assign(AALo, AALo+a->nz); 885da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 886da79fbbcSStefano Zampini } 88757d48284SJunchao Zhang cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr); 88857d48284SJunchao Zhang cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr); 889087f3262SPaul Mullowney } catch(char *ex) { 890087f3262SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 891087f3262SPaul Mullowney } 892087f3262SPaul Mullowney } 893087f3262SPaul Mullowney PetscFunctionReturn(0); 894087f3262SPaul Mullowney } 895087f3262SPaul Mullowney 896087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 8979ae82921SPaul Mullowney { 8989ae82921SPaul Mullowney PetscErrorCode ierr; 899087f3262SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 900087f3262SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = 
(Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 901087f3262SPaul Mullowney IS ip = a->row; 902087f3262SPaul Mullowney PetscBool perm_identity; 903087f3262SPaul Mullowney PetscInt n = A->rmap->n; 904087f3262SPaul Mullowney 905087f3262SPaul Mullowney PetscFunctionBegin; 906da79fbbcSStefano Zampini if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors"); 907087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr); 908da79fbbcSStefano Zampini if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); } 909aa372e3fSPaul Mullowney cusparseTriFactors->nnz=(a->nz-n)*2 + n; 910aa372e3fSPaul Mullowney 911da79fbbcSStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 912da79fbbcSStefano Zampini 913087f3262SPaul Mullowney /* lower triangular indices */ 914087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 915087f3262SPaul Mullowney if (!perm_identity) { 9164e4bbfaaSStefano Zampini IS iip; 917da79fbbcSStefano Zampini const PetscInt *irip,*rip; 9184e4bbfaaSStefano Zampini 9194e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 9204e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 921da79fbbcSStefano Zampini ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 922aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 923aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 924aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 9254e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 9264e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 9274e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 928087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 929da79fbbcSStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 930da79fbbcSStefano Zampini } 
931087f3262SPaul Mullowney PetscFunctionReturn(0); 932087f3262SPaul Mullowney } 933087f3262SPaul Mullowney 9346fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 9359ae82921SPaul Mullowney { 9369ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 9379ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 9389ae82921SPaul Mullowney PetscBool row_identity,col_identity; 939b175d8bbSPaul Mullowney PetscErrorCode ierr; 9409ae82921SPaul Mullowney 9419ae82921SPaul Mullowney PetscFunctionBegin; 94257181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 9439ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 944ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 945e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 9469ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 9479ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 948bda325fcSPaul Mullowney if (row_identity && col_identity) { 949bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 950bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9514e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9524e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 953bda325fcSPaul Mullowney } else { 954bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 955bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9564e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9574e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 958bda325fcSPaul Mullowney } 9598dc1d2a3SPaul Mullowney 960e057df02SPaul Mullowney /* get the triangular factors */ 961087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 9629ae82921SPaul Mullowney PetscFunctionReturn(0); 9639ae82921SPaul 
Mullowney } 9649ae82921SPaul Mullowney 965087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 966087f3262SPaul Mullowney { 967087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 968087f3262SPaul Mullowney IS ip = b->row; 969087f3262SPaul Mullowney PetscBool perm_identity; 970b175d8bbSPaul Mullowney PetscErrorCode ierr; 971087f3262SPaul Mullowney 972087f3262SPaul Mullowney PetscFunctionBegin; 97357181aedSStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 974087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 975ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 976087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 977087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 978087f3262SPaul Mullowney if (perm_identity) { 979087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 980087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 9814e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9824e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 983087f3262SPaul Mullowney } else { 984087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 985087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 9864e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 9874e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 988087f3262SPaul Mullowney } 989087f3262SPaul Mullowney 990087f3262SPaul Mullowney /* get the triangular factors */ 991087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 992087f3262SPaul Mullowney PetscFunctionReturn(0); 993087f3262SPaul Mullowney } 9949ae82921SPaul Mullowney 995b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 996bda325fcSPaul Mullowney { 997bda325fcSPaul 
Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 998aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 999aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1000da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1001da79fbbcSStefano Zampini Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1002bda325fcSPaul Mullowney cusparseStatus_t stat; 1003aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1004aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 1005aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 1006aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 10071b0a6780SStefano Zampini cudaError_t cerr; 1008da79fbbcSStefano Zampini PetscErrorCode ierr; 1009b175d8bbSPaul Mullowney 1010bda325fcSPaul Mullowney PetscFunctionBegin; 1011aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 1012da79fbbcSStefano Zampini ierr = PetscNew(&loTriFactorT);CHKERRQ(ierr); 1013da79fbbcSStefano Zampini loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1014aa372e3fSPaul Mullowney 1015aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 1016aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 1017aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1018aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1019aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1020aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 1021aa372e3fSPaul Mullowney 1022aa372e3fSPaul Mullowney /* Create the matrix description */ 102357d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 102457d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 102557d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 102657d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 102757d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1028aa372e3fSPaul Mullowney 1029aa372e3fSPaul Mullowney /* set the operation */ 1030aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1031aa372e3fSPaul Mullowney 1032aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 1033aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 1034afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1035afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1036aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1037afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 1038afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1039afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1040aa372e3fSPaul Mullowney 1041aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 1042afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1043afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1044afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1045afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 1046afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 1047afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 1048afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 1049afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1050afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1051afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 10521b0a6780SStefano Zampini cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1053afb2bd1cSJunchao Zhang #endif 1054afb2bd1cSJunchao Zhang 1055da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1056aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 1057aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 1058aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1059aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1060aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1061aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1062afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1063afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1064afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1065afb2bd1cSJunchao 
Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 1066afb2bd1cSJunchao Zhang #else 1067afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1068afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1069afb2bd1cSJunchao Zhang #endif 1070afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1071da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1072da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1073aa372e3fSPaul Mullowney 1074afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1075da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1076afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 10771b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1078afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 1079afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1080afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1081afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 1082afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1083afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1084afb2bd1cSJunchao Zhang #endif 1085afb2bd1cSJunchao Zhang 1086afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1087aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 1088afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 1089afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 
loTriFactorT->csrMat->row_offsets->data().get(), 1090afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 10911b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1092afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1093afb2bd1cSJunchao Zhang #endif 1094afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1095da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1096da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1097aa372e3fSPaul Mullowney 1098da79fbbcSStefano Zampini /* assign the pointer */ 1099aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1100aa372e3fSPaul Mullowney 1101aa372e3fSPaul Mullowney /*********************************************/ 1102aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 1103aa372e3fSPaul Mullowney /*********************************************/ 1104aa372e3fSPaul Mullowney 1105aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 1106da79fbbcSStefano Zampini ierr = PetscNew(&upTriFactorT);CHKERRQ(ierr); 1107da79fbbcSStefano Zampini upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1108aa372e3fSPaul Mullowney 1109aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 1110aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 1111aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1112aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1113aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1114aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1115aa372e3fSPaul Mullowney 1116aa372e3fSPaul Mullowney /* Create the matrix description */ 111757d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 111857d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 111957d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 112057d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 112157d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1122aa372e3fSPaul Mullowney 1123aa372e3fSPaul Mullowney /* set the operation */ 1124aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1125aa372e3fSPaul Mullowney 1126aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1127aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1128afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1129afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1130aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1131afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1132afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1133afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1134aa372e3fSPaul Mullowney 1135aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1136afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1137afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1138afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1139afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1140afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1141afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1142afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1143afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1144afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1145afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1146afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1147afb2bd1cSJunchao Zhang #endif 1148afb2bd1cSJunchao Zhang 1149da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1150aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1151aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1152aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1153aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1154aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1155aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1156afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1157afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1158afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1159afb2bd1cSJunchao Zhang 
CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1160afb2bd1cSJunchao Zhang #else 1161afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1162afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1163afb2bd1cSJunchao Zhang #endif 1164afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1165da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1166da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1167aa372e3fSPaul Mullowney 1168afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1169da79fbbcSStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1170afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 11711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1172afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1173afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1174afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1175afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1176afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1177afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1178afb2bd1cSJunchao Zhang #endif 1179afb2bd1cSJunchao Zhang 1180afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1181aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1182afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1183afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 
upTriFactorT->csrMat->row_offsets->data().get(), 1184afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 11851b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1186afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1187afb2bd1cSJunchao Zhang #endif 1188afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1189da79fbbcSStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 1190da79fbbcSStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr); 1191aa372e3fSPaul Mullowney 1192da79fbbcSStefano Zampini /* assign the pointer */ 1193aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1194bda325fcSPaul Mullowney PetscFunctionReturn(0); 1195bda325fcSPaul Mullowney } 1196bda325fcSPaul Mullowney 1197b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1198bda325fcSPaul Mullowney { 1199aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1200aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1201aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1202bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203bda325fcSPaul Mullowney cusparseStatus_t stat; 1204aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1205b06137fdSPaul Mullowney cudaError_t err; 120685ba7357SStefano Zampini PetscErrorCode ierr; 1207b175d8bbSPaul Mullowney 1208bda325fcSPaul Mullowney PetscFunctionBegin; 1209fcdce8c4SStefano Zampini if (!cusparsestruct->transgen || cusparsestruct->matTranspose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0); 121085ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 121185ba7357SStefano Zampini ierr = 
PetscLogGpuTimeBegin();CHKERRQ(ierr); 121285ba7357SStefano Zampini /* create cusparse matrix */ 1213aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 121457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1215aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 121657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 121757d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1218aa372e3fSPaul Mullowney 1219b06137fdSPaul Mullowney /* set alpha and beta */ 1220afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 12217656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 12227656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1223afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12247656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 12257656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 122657d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1227b06137fdSPaul Mullowney 1228aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1229aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1230aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 1231554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1232554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1233aa372e3fSPaul Mullowney 
matrixT->num_entries = a->nz; 1234a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1235aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1236aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1237a3fdcf43SKarl Rupp 123881902715SJunchao Zhang cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 123981902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1240afb2bd1cSJunchao Zhang 124181902715SJunchao Zhang /* compute the transpose, i.e. the CSC */ 1242afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1243afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1244afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1245afb2bd1cSJunchao Zhang matrix->values->data().get(), 1246afb2bd1cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1247afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1248afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1249afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1250afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1251afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1252afb2bd1cSJunchao Zhang err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1253afb2bd1cSJunchao Zhang #endif 1254afb2bd1cSJunchao Zhang 1255a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1256a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 1257aa372e3fSPaul Mullowney matrix->values->data().get(), 125881902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1259aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1260aa372e3fSPaul Mullowney matrixT->values->data().get(), 1261afb2bd1cSJunchao Zhang #if 
PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1262afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1263afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1264afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1265afb2bd1cSJunchao Zhang #else 1266afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1267afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1268afb2bd1cSJunchao Zhang #endif 1269afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1270aa372e3fSPaul Mullowney matstructT->mat = matrixT; 1271afb2bd1cSJunchao Zhang 1272afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1273afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1274afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1275afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1276afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1277afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1278afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1279afb2bd1cSJunchao Zhang #endif 1280aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1281afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1282afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1283afb2bd1cSJunchao Zhang #else 1284aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 128551c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 128651c6d536SStefano Zampini /* First convert HYB to CSR */ 1287aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1288aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1289aa372e3fSPaul Mullowney temp->num_entries 
= a->nz; 1290aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1291aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1292aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1293aa372e3fSPaul Mullowney 1294aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1295aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1296aa372e3fSPaul Mullowney temp->values->data().get(), 1297aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 129857d48284SJunchao Zhang temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1299aa372e3fSPaul Mullowney 1300aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1301aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1302aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1303aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1304aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1305aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1306aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1307aa372e3fSPaul Mullowney 1308aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1309aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1310aa372e3fSPaul Mullowney temp->values->data().get(), 1311aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1312aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1313aa372e3fSPaul Mullowney tempT->values->data().get(), 1314aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1315aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 131657d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1317aa372e3fSPaul Mullowney 1318aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1319aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 132057d48284SJunchao Zhang stat = 
cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1321aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 1322aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1323aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1324aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1325aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1326aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 132757d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1328aa372e3fSPaul Mullowney 1329aa372e3fSPaul Mullowney /* assign the pointer */ 1330aa372e3fSPaul Mullowney matstructT->mat = hybMat; 1331aa372e3fSPaul Mullowney /* delete temporaries */ 1332aa372e3fSPaul Mullowney if (tempT) { 1333aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1334aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1335aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1336aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1337087f3262SPaul Mullowney } 1338aa372e3fSPaul Mullowney if (temp) { 1339aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1340aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1341aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1342aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1343aa372e3fSPaul Mullowney } 1344afb2bd1cSJunchao Zhang #endif 1345aa372e3fSPaul Mullowney } 134605035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 134785ba7357SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 134885ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1349213423ffSJunchao Zhang /* the compressed row indices is not 
used for matTranspose */ 1350213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1351aa372e3fSPaul Mullowney /* assign the pointer */ 1352aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1353bda325fcSPaul Mullowney PetscFunctionReturn(0); 1354bda325fcSPaul Mullowney } 1355bda325fcSPaul Mullowney 13564e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 13576fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1358bda325fcSPaul Mullowney { 1359c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1360465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1361465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1362465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1363465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1364bda325fcSPaul Mullowney cusparseStatus_t stat; 1365bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1366aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1367aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1368aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1369b175d8bbSPaul Mullowney PetscErrorCode ierr; 137057d48284SJunchao Zhang cudaError_t cerr; 1371bda325fcSPaul Mullowney 1372bda325fcSPaul Mullowney PetscFunctionBegin; 1373aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1374aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1375bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1376aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1377aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1378bda325fcSPaul Mullowney } 1379bda325fcSPaul Mullowney 1380bda325fcSPaul Mullowney /* Get the GPU pointers */ 1381c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1382c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1383c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1384c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1385bda325fcSPaul Mullowney 13867a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1387aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1388c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1389c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1390c41cb2e2SAlejandro Lamas Daviña xGPU); 1391aa372e3fSPaul Mullowney 1392aa372e3fSPaul Mullowney /* First, solve U */ 1393aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1394afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 13951b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1396afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1397afb2bd1cSJunchao Zhang #endif 1398afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1399aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1400aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1401aa372e3fSPaul Mullowney 
upTriFactorT->csrMat->column_indices->data().get(), 1402aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1403afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 14041b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1405afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1406afb2bd1cSJunchao Zhang #endif 1407afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1408aa372e3fSPaul Mullowney 1409aa372e3fSPaul Mullowney /* Then, solve L */ 1410aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1411afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14121b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1413afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1414afb2bd1cSJunchao Zhang #endif 1415afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1416aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1417aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1418aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1419aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1420afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14211b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1422afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1423afb2bd1cSJunchao Zhang #endif 1424afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1425aa372e3fSPaul Mullowney 1426aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1427c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1428c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1429aa372e3fSPaul Mullowney tempGPU->begin()); 1430aa372e3fSPaul Mullowney 1431aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. 
*/ 1432c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1433bda325fcSPaul Mullowney 1434bda325fcSPaul Mullowney /* restore */ 1435c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1436c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 143705035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1438661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1439958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1440bda325fcSPaul Mullowney PetscFunctionReturn(0); 1441bda325fcSPaul Mullowney } 1442bda325fcSPaul Mullowney 14436fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444bda325fcSPaul Mullowney { 1445465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1446465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1447bda325fcSPaul Mullowney cusparseStatus_t stat; 1448bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1449aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1450aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1451aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1452b175d8bbSPaul Mullowney PetscErrorCode ierr; 145357d48284SJunchao Zhang cudaError_t cerr; 1454bda325fcSPaul Mullowney 1455bda325fcSPaul Mullowney PetscFunctionBegin; 1456aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1457aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1458bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1459aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1460aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1461bda325fcSPaul Mullowney } 1462bda325fcSPaul Mullowney 1463bda325fcSPaul Mullowney /* Get the GPU pointers */ 1464c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1465c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1466bda325fcSPaul Mullowney 14677a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1468aa372e3fSPaul Mullowney /* First, solve U */ 1469aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1470afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 14711b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1472afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1473afb2bd1cSJunchao Zhang #endif 1474afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1475aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1476aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1477aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1478aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1479afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 14801b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1481afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1482afb2bd1cSJunchao Zhang #endif 1483afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1484aa372e3fSPaul Mullowney 1485aa372e3fSPaul Mullowney /* Then, solve L */ 1486aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, 
loTriFactorT->solveOp, 1487afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 14881b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1489afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1490afb2bd1cSJunchao Zhang #endif 1491afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1492aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1493aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1494aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1495aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1496afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 14971b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1498afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1499afb2bd1cSJunchao Zhang #endif 1500afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1501bda325fcSPaul Mullowney 1502bda325fcSPaul Mullowney /* restore */ 1503c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1504c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 150505035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1506661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1507958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1508bda325fcSPaul Mullowney PetscFunctionReturn(0); 1509bda325fcSPaul Mullowney } 1510bda325fcSPaul Mullowney 15116fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 15129ae82921SPaul Mullowney { 1513465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1514465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1515465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1516465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 15179ae82921SPaul Mullowney cusparseStatus_t stat; 15189ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors 
*cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1519aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1520aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1521aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1522b175d8bbSPaul Mullowney PetscErrorCode ierr; 152357d48284SJunchao Zhang cudaError_t cerr; 15249ae82921SPaul Mullowney 15259ae82921SPaul Mullowney PetscFunctionBegin; 1526ebc8f436SDominic Meiser 1527e057df02SPaul Mullowney /* Get the GPU pointers */ 1528c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1529c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1530c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1531c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 15329ae82921SPaul Mullowney 15337a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1534aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1535c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1536c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), 15374e4bbfaaSStefano Zampini tempGPU->begin()); 1538aa372e3fSPaul Mullowney 1539aa372e3fSPaul Mullowney /* Next, solve L */ 1540aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1541afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 15421b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1543afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1544afb2bd1cSJunchao Zhang #endif 1545afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 
1546aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1547aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1548aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1549aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1550afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 15511b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1552afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1553afb2bd1cSJunchao Zhang #endif 1554afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1555aa372e3fSPaul Mullowney 1556aa372e3fSPaul Mullowney /* Then, solve U */ 1557aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1558afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 15591b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1560afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1561afb2bd1cSJunchao Zhang #endif 1562afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1563aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1564aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1565aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1566aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1567afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 15681b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1569afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1570afb2bd1cSJunchao Zhang #endif 1571afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1572aa372e3fSPaul Mullowney 15734e4bbfaaSStefano Zampini /* Last, reorder with the column permutation */ 15744e4bbfaaSStefano Zampini thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), 15754e4bbfaaSStefano Zampini thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), 15764e4bbfaaSStefano Zampini xGPU); 
15779ae82921SPaul Mullowney 1578c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1579c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 158005035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1581661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1582958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 15839ae82921SPaul Mullowney PetscFunctionReturn(0); 15849ae82921SPaul Mullowney } 15859ae82921SPaul Mullowney 15866fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 15879ae82921SPaul Mullowney { 1588465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1589465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 15909ae82921SPaul Mullowney cusparseStatus_t stat; 15919ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1592aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 1593aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 1594aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1595b175d8bbSPaul Mullowney PetscErrorCode ierr; 159657d48284SJunchao Zhang cudaError_t cerr; 15979ae82921SPaul Mullowney 15989ae82921SPaul Mullowney PetscFunctionBegin; 1599e057df02SPaul Mullowney /* Get the GPU pointers */ 1600c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1601c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 16029ae82921SPaul Mullowney 16037a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1604aa372e3fSPaul Mullowney /* First, solve L */ 1605aa372e3fSPaul Mullowney stat = 
cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp, 1606afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, 16071b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1608afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_entries, 1609afb2bd1cSJunchao Zhang #endif 1610afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactor->descr, 1611aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 1612aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 1613aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 1614aa372e3fSPaul Mullowney loTriFactor->solveInfo, 1615afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 16161b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1617afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 1618afb2bd1cSJunchao Zhang #endif 1619afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1620aa372e3fSPaul Mullowney 1621aa372e3fSPaul Mullowney /* Next, solve U */ 1622aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp, 1623afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_rows, 16241b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1625afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_entries, 1626afb2bd1cSJunchao Zhang #endif 1627afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactor->descr, 1628aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1629aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1630aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1631aa372e3fSPaul Mullowney upTriFactor->solveInfo, 1632afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 16331b0a6780SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(9,0,0) 1634afb2bd1cSJunchao Zhang ,upTriFactor->solvePolicy, upTriFactor->solveBuffer 1635afb2bd1cSJunchao Zhang #endif 1636afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 16379ae82921SPaul Mullowney 1638c41cb2e2SAlejandro 
Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1639c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 164005035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1641661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1642958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 16439ae82921SPaul Mullowney PetscFunctionReturn(0); 16449ae82921SPaul Mullowney } 16459ae82921SPaul Mullowney 16467e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A) 16477e8381f9SStefano Zampini { 16487e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 16497e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 16507e8381f9SStefano Zampini cudaError_t cerr; 16517e8381f9SStefano Zampini PetscErrorCode ierr; 16527e8381f9SStefano Zampini 16537e8381f9SStefano Zampini PetscFunctionBegin; 16547e8381f9SStefano Zampini if (A->offloadmask == PETSC_OFFLOAD_GPU) { 16557e8381f9SStefano Zampini CsrMatrix *matrix = (CsrMatrix*)cusp->mat->mat; 16567e8381f9SStefano Zampini 16577e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 16587e8381f9SStefano Zampini cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 16597e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 16607e8381f9SStefano Zampini ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr); 16617e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr); 16627e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_BOTH; 16637e8381f9SStefano Zampini } 16647e8381f9SStefano Zampini PetscFunctionReturn(0); 16657e8381f9SStefano Zampini } 16667e8381f9SStefano Zampini 16677e8381f9SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[]) 
16687e8381f9SStefano Zampini { 16697e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 16707e8381f9SStefano Zampini PetscErrorCode ierr; 16717e8381f9SStefano Zampini 16727e8381f9SStefano Zampini PetscFunctionBegin; 16737e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 16747e8381f9SStefano Zampini *array = a->a; 16757e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 16767e8381f9SStefano Zampini PetscFunctionReturn(0); 16777e8381f9SStefano Zampini } 16787e8381f9SStefano Zampini 16796fa9248bSJed Brown static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A) 16809ae82921SPaul Mullowney { 1681aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 16827c700b8dSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat; 16839ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1684213423ffSJunchao Zhang PetscInt m = A->rmap->n,*ii,*ridx,tmp; 16859ae82921SPaul Mullowney PetscErrorCode ierr; 1686aa372e3fSPaul Mullowney cusparseStatus_t stat; 1687b06137fdSPaul Mullowney cudaError_t err; 16889ae82921SPaul Mullowney 16899ae82921SPaul Mullowney PetscFunctionBegin; 1690fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU"); 1691c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 1692fcdce8c4SStefano Zampini if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { 169381902715SJunchao Zhang /* Copy values only */ 1694afb2bd1cSJunchao Zhang CsrMatrix *matrix,*matrixT; 1695afb2bd1cSJunchao Zhang matrix = (CsrMatrix*)cusparsestruct->mat->mat; 169685ba7357SStefano Zampini 169785ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 1698afb2bd1cSJunchao Zhang matrix->values->assign(a->a, a->a+a->nz); 169905035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 17004863603aSSatish 
Balay ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr); 170185ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 170281902715SJunchao Zhang 170381902715SJunchao Zhang /* Update matT when it was built before */ 170481902715SJunchao Zhang if (cusparsestruct->matTranspose) { 170581902715SJunchao Zhang cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr); 1706afb2bd1cSJunchao Zhang matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat; 170785ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 170881902715SJunchao Zhang stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1709afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1710afb2bd1cSJunchao Zhang matrix->values->data().get(), 171181902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1712afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1713afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1714afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1715afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1716afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1717afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1718afb2bd1cSJunchao Zhang #else 1719afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1720afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1721afb2bd1cSJunchao Zhang #endif 1722afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 172305035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 172485ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 172581902715SJunchao Zhang } 172634d6c7a5SJose E. 
Roman } else { 172785ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 17287c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr); 17297c700b8dSJunchao Zhang ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr); 17307c700b8dSJunchao Zhang delete cusparsestruct->workVector; 173181902715SJunchao Zhang delete cusparsestruct->rowoffsets_gpu; 17329ae82921SPaul Mullowney try { 17339ae82921SPaul Mullowney if (a->compressedrow.use) { 17349ae82921SPaul Mullowney m = a->compressedrow.nrows; 17359ae82921SPaul Mullowney ii = a->compressedrow.i; 17369ae82921SPaul Mullowney ridx = a->compressedrow.rindex; 17379ae82921SPaul Mullowney } else { 1738213423ffSJunchao Zhang m = A->rmap->n; 1739213423ffSJunchao Zhang ii = a->i; 1740e6e9a74fSStefano Zampini ridx = NULL; 17419ae82921SPaul Mullowney } 1742213423ffSJunchao Zhang cusparsestruct->nrows = m; 17439ae82921SPaul Mullowney 174485ba7357SStefano Zampini /* create cusparse matrix */ 1745aa372e3fSPaul Mullowney matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 174657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat); 174757d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 174857d48284SJunchao Zhang stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 17499ae82921SPaul Mullowney 1750afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err); 17517656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 17527656d835SStefano Zampini err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1753afb2bd1cSJunchao Zhang err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17547656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 17557656d835SStefano Zampini err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 175657d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1757b06137fdSPaul Mullowney 1758aa372e3fSPaul Mullowney /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 1759aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1760aa372e3fSPaul Mullowney /* set the matrix */ 1761afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1762afb2bd1cSJunchao Zhang mat->num_rows = m; 1763afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1764afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1765afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1766afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 17679ae82921SPaul Mullowney 1768afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1769afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1770aa372e3fSPaul Mullowney 1771afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1772afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1773aa372e3fSPaul Mullowney 1774aa372e3fSPaul Mullowney /* assign the pointer */ 1775afb2bd1cSJunchao Zhang matstruct->mat = mat; 1776afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1777afb2bd1cSJunchao Zhang if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 1778afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstruct->matDescr, 1779afb2bd1cSJunchao Zhang mat->num_rows, mat->num_cols, mat->num_entries, 1780afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), mat->column_indices->data().get(), 1781afb2bd1cSJunchao Zhang mat->values->data().get(), 1782afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 1783afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 1784afb2bd1cSJunchao Zhang } 1785afb2bd1cSJunchao Zhang #endif 1786aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1787afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1788afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1789afb2bd1cSJunchao Zhang #else 1790afb2bd1cSJunchao Zhang CsrMatrix *mat= new CsrMatrix; 1791afb2bd1cSJunchao Zhang mat->num_rows = m; 1792afb2bd1cSJunchao Zhang mat->num_cols = A->cmap->n; 1793afb2bd1cSJunchao Zhang mat->num_entries = a->nz; 1794afb2bd1cSJunchao Zhang mat->row_offsets = new THRUSTINTARRAY32(m+1); 1795afb2bd1cSJunchao Zhang mat->row_offsets->assign(ii, ii + m+1); 1796aa372e3fSPaul Mullowney 1797afb2bd1cSJunchao Zhang mat->column_indices = new THRUSTINTARRAY32(a->nz); 1798afb2bd1cSJunchao Zhang mat->column_indices->assign(a->j, a->j+a->nz); 1799aa372e3fSPaul Mullowney 1800afb2bd1cSJunchao Zhang mat->values = new THRUSTARRAY(a->nz); 1801afb2bd1cSJunchao Zhang mat->values->assign(a->a, a->a+a->nz); 1802aa372e3fSPaul Mullowney 1803aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 180457d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1805aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1806aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1807afb2bd1cSJunchao Zhang stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, 1808afb2bd1cSJunchao Zhang matstruct->descr, mat->values->data().get(), 1809afb2bd1cSJunchao Zhang mat->row_offsets->data().get(), 1810afb2bd1cSJunchao Zhang mat->column_indices->data().get(), 181157d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1812aa372e3fSPaul Mullowney /* assign the pointer */ 1813aa372e3fSPaul Mullowney matstruct->mat = hybMat; 1814aa372e3fSPaul Mullowney 1815afb2bd1cSJunchao Zhang if (mat) { 1816afb2bd1cSJunchao Zhang if (mat->values) delete (THRUSTARRAY*)mat->values; 1817afb2bd1cSJunchao Zhang if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices; 1818afb2bd1cSJunchao Zhang if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets; 1819afb2bd1cSJunchao Zhang delete (CsrMatrix*)mat; 1820087f3262SPaul Mullowney } 1821afb2bd1cSJunchao Zhang #endif 1822087f3262SPaul Mullowney } 1823ca45077fSPaul Mullowney 1824aa372e3fSPaul Mullowney /* assign the compressed row indices */ 1825213423ffSJunchao Zhang if (a->compressedrow.use) { 1826213423ffSJunchao Zhang cusparsestruct->workVector = new THRUSTARRAY(m); 1827aa372e3fSPaul Mullowney matstruct->cprowIndices = new THRUSTINTARRAY(m); 1828aa372e3fSPaul Mullowney matstruct->cprowIndices->assign(ridx,ridx+m); 1829213423ffSJunchao Zhang tmp = m; 1830213423ffSJunchao Zhang } else { 1831213423ffSJunchao Zhang cusparsestruct->workVector = NULL; 1832213423ffSJunchao Zhang matstruct->cprowIndices = NULL; 1833213423ffSJunchao Zhang tmp = 0; 1834213423ffSJunchao Zhang } 1835213423ffSJunchao Zhang ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr); 1836aa372e3fSPaul Mullowney 1837aa372e3fSPaul Mullowney /* assign the pointer */ 1838aa372e3fSPaul Mullowney cusparsestruct->mat = matstruct; 18399ae82921SPaul Mullowney } 
catch(char *ex) { 18409ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 18419ae82921SPaul Mullowney } 184205035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 184385ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr); 184434d6c7a5SJose E. Roman cusparsestruct->nonzerostate = A->nonzerostate; 184534d6c7a5SJose E. Roman } 1846c70f7ee4SJunchao Zhang A->offloadmask = PETSC_OFFLOAD_BOTH; 18479ae82921SPaul Mullowney } 18489ae82921SPaul Mullowney PetscFunctionReturn(0); 18499ae82921SPaul Mullowney } 18509ae82921SPaul Mullowney 1851c41cb2e2SAlejandro Lamas Daviña struct VecCUDAPlusEquals 1852aa372e3fSPaul Mullowney { 1853aa372e3fSPaul Mullowney template <typename Tuple> 1854aa372e3fSPaul Mullowney __host__ __device__ 1855aa372e3fSPaul Mullowney void operator()(Tuple t) 1856aa372e3fSPaul Mullowney { 1857aa372e3fSPaul Mullowney thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 1858aa372e3fSPaul Mullowney } 1859aa372e3fSPaul Mullowney }; 1860aa372e3fSPaul Mullowney 18617e8381f9SStefano Zampini struct VecCUDAEquals 18627e8381f9SStefano Zampini { 18637e8381f9SStefano Zampini template <typename Tuple> 18647e8381f9SStefano Zampini __host__ __device__ 18657e8381f9SStefano Zampini void operator()(Tuple t) 18667e8381f9SStefano Zampini { 18677e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 18687e8381f9SStefano Zampini } 18697e8381f9SStefano Zampini }; 18707e8381f9SStefano Zampini 1871e6e9a74fSStefano Zampini struct VecCUDAEqualsReverse 1872e6e9a74fSStefano Zampini { 1873e6e9a74fSStefano Zampini template <typename Tuple> 1874e6e9a74fSStefano Zampini __host__ __device__ 1875e6e9a74fSStefano Zampini void operator()(Tuple t) 1876e6e9a74fSStefano Zampini { 1877e6e9a74fSStefano Zampini thrust::get<0>(t) = thrust::get<1>(t); 1878e6e9a74fSStefano Zampini } 1879e6e9a74fSStefano Zampini }; 1880e6e9a74fSStefano Zampini 1881afb2bd1cSJunchao Zhang struct MatMatCusparse { 
1882ccdfe979SStefano Zampini PetscBool cisdense; 1883ccdfe979SStefano Zampini PetscScalar *Bt; 1884ccdfe979SStefano Zampini Mat X; 1885fcdce8c4SStefano Zampini PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 1886fcdce8c4SStefano Zampini PetscLogDouble flops; 1887fcdce8c4SStefano Zampini CsrMatrix *Bcsr; 1888afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1889fcdce8c4SStefano Zampini cusparseSpMatDescr_t matSpBDescr; 1890afb2bd1cSJunchao Zhang PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 1891afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matBDescr; 1892afb2bd1cSJunchao Zhang cusparseDnMatDescr_t matCDescr; 1893afb2bd1cSJunchao Zhang PetscInt Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/ 1894fcdce8c4SStefano Zampini size_t mmBufferSize; 1895fcdce8c4SStefano Zampini void *mmBuffer; 1896fcdce8c4SStefano Zampini void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 1897fcdce8c4SStefano Zampini cusparseSpGEMMDescr_t spgemmDesc; 1898afb2bd1cSJunchao Zhang #endif 1899afb2bd1cSJunchao Zhang }; 1900ccdfe979SStefano Zampini 1901ccdfe979SStefano Zampini static PetscErrorCode MatDestroy_MatMatCusparse(void *data) 1902ccdfe979SStefano Zampini { 1903ccdfe979SStefano Zampini PetscErrorCode ierr; 1904ccdfe979SStefano Zampini MatMatCusparse *mmdata = (MatMatCusparse *)data; 1905ccdfe979SStefano Zampini cudaError_t cerr; 1906fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1907fcdce8c4SStefano Zampini cusparseStatus_t stat; 1908fcdce8c4SStefano Zampini #endif 1909ccdfe979SStefano Zampini 1910ccdfe979SStefano Zampini PetscFunctionBegin; 1911ccdfe979SStefano Zampini cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr); 1912fcdce8c4SStefano Zampini delete mmdata->Bcsr; 1913afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1914fcdce8c4SStefano Zampini if (mmdata->matSpBDescr) { stat = cusparseDestroySpMat(mmdata->matSpBDescr);CHKERRCUSPARSE(stat); } 
1915fcdce8c4SStefano Zampini if (mmdata->mmBuffer) { cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); } 1916fcdce8c4SStefano Zampini if (mmdata->mmBuffer2) { cerr = cudaFree(mmdata->mmBuffer2);CHKERRCUDA(cerr); } 1917afb2bd1cSJunchao Zhang if (mmdata->matBDescr) { stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); } 1918afb2bd1cSJunchao Zhang if (mmdata->matCDescr) { stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); } 1919fcdce8c4SStefano Zampini if (mmdata->spgemmDesc) { stat = cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc);CHKERRCUSPARSE(stat); } 1920afb2bd1cSJunchao Zhang #endif 1921ccdfe979SStefano Zampini ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr); 1922ccdfe979SStefano Zampini ierr = PetscFree(data);CHKERRQ(ierr); 1923ccdfe979SStefano Zampini PetscFunctionReturn(0); 1924ccdfe979SStefano Zampini } 1925ccdfe979SStefano Zampini 1926ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool); 1927ccdfe979SStefano Zampini 1928ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 1929ccdfe979SStefano Zampini { 1930ccdfe979SStefano Zampini Mat_Product *product = C->product; 1931ccdfe979SStefano Zampini Mat A,B; 1932afb2bd1cSJunchao Zhang PetscInt m,n,blda,clda; 1933ccdfe979SStefano Zampini PetscBool flg,biscuda; 1934ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 1935ccdfe979SStefano Zampini cusparseStatus_t stat; 1936ccdfe979SStefano Zampini cusparseOperation_t opA; 1937ccdfe979SStefano Zampini const PetscScalar *barray; 1938ccdfe979SStefano Zampini PetscScalar *carray; 1939ccdfe979SStefano Zampini PetscErrorCode ierr; 1940ccdfe979SStefano Zampini MatMatCusparse *mmdata; 1941ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *mat; 1942ccdfe979SStefano Zampini CsrMatrix *csrmat; 1943afb2bd1cSJunchao Zhang cudaError_t cerr; 1944ccdfe979SStefano Zampini 1945ccdfe979SStefano Zampini 
PetscFunctionBegin; 1946ccdfe979SStefano Zampini MatCheckProduct(C,1); 1947ccdfe979SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 1948ccdfe979SStefano Zampini mmdata = (MatMatCusparse*)product->data; 1949ccdfe979SStefano Zampini A = product->A; 1950ccdfe979SStefano Zampini B = product->B; 1951ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 1952ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 1953ccdfe979SStefano Zampini /* currently CopyToGpu does not copy if the matrix is bound to CPU 1954ccdfe979SStefano Zampini Instead of silently accepting the wrong answer, I prefer to raise the error */ 1955ccdfe979SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 1956ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 1957ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 1958ccdfe979SStefano Zampini switch (product->type) { 1959ccdfe979SStefano Zampini case MATPRODUCT_AB: 1960ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 1961ccdfe979SStefano Zampini mat = cusp->mat; 1962ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1963ccdfe979SStefano Zampini m = A->rmap->n; 1964ccdfe979SStefano Zampini n = B->cmap->n; 1965ccdfe979SStefano Zampini break; 1966ccdfe979SStefano Zampini case MATPRODUCT_AtB: 1967e6e9a74fSStefano Zampini if (!cusp->transgen) { 1968e6e9a74fSStefano Zampini mat = cusp->mat; 1969e6e9a74fSStefano Zampini opA = CUSPARSE_OPERATION_TRANSPOSE; 1970e6e9a74fSStefano Zampini } else { 1971ccdfe979SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 1972ccdfe979SStefano Zampini mat = cusp->matTranspose; 1973ccdfe979SStefano Zampini 
opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1974e6e9a74fSStefano Zampini } 1975ccdfe979SStefano Zampini m = A->cmap->n; 1976ccdfe979SStefano Zampini n = B->cmap->n; 1977ccdfe979SStefano Zampini break; 1978ccdfe979SStefano Zampini case MATPRODUCT_ABt: 1979ccdfe979SStefano Zampini case MATPRODUCT_RARt: 1980ccdfe979SStefano Zampini mat = cusp->mat; 1981ccdfe979SStefano Zampini opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 1982ccdfe979SStefano Zampini m = A->rmap->n; 1983ccdfe979SStefano Zampini n = B->rmap->n; 1984ccdfe979SStefano Zampini break; 1985ccdfe979SStefano Zampini default: 1986ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 1987ccdfe979SStefano Zampini } 1988ccdfe979SStefano Zampini if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 1989ccdfe979SStefano Zampini csrmat = (CsrMatrix*)mat->mat; 1990ccdfe979SStefano Zampini /* if the user passed a CPU matrix, copy the data to the GPU */ 1991ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr); 1992afb2bd1cSJunchao Zhang if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);} 1993ccdfe979SStefano Zampini ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr); 1994afb2bd1cSJunchao Zhang 1995ccdfe979SStefano Zampini ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr); 1996c8378d12SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 1997c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 1998c8378d12SStefano Zampini ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr); 1999c8378d12SStefano Zampini } else { 2000c8378d12SStefano Zampini ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr); 2001c8378d12SStefano Zampini ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr); 2002c8378d12SStefano Zampini } 
2003c8378d12SStefano Zampini 2004c8378d12SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2005afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2006afb2bd1cSJunchao Zhang cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2007fcdce8c4SStefano Zampini /* (re)allcoate mmBuffer if not initialized or LDAs are different */ 2008afb2bd1cSJunchao Zhang if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2009fcdce8c4SStefano Zampini size_t mmBufferSize; 2010afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;} 2011afb2bd1cSJunchao Zhang if (!mmdata->matBDescr) { 2012afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2013afb2bd1cSJunchao Zhang mmdata->Blda = blda; 2014afb2bd1cSJunchao Zhang } 2015c8378d12SStefano Zampini 2016afb2bd1cSJunchao Zhang if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;} 2017afb2bd1cSJunchao Zhang if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2018afb2bd1cSJunchao Zhang stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat); 2019afb2bd1cSJunchao Zhang mmdata->Clda = clda; 2020afb2bd1cSJunchao Zhang } 2021afb2bd1cSJunchao Zhang 2022afb2bd1cSJunchao Zhang if (!mat->matDescr) { 2023afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&mat->matDescr, 2024afb2bd1cSJunchao Zhang csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, 2025afb2bd1cSJunchao Zhang csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), 2026afb2bd1cSJunchao Zhang 
csrmat->values->data().get(), 2027afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2028afb2bd1cSJunchao Zhang CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat); 2029afb2bd1cSJunchao Zhang } 2030afb2bd1cSJunchao Zhang stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one, 2031afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2032afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2033fcdce8c4SStefano Zampini cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat); 2034fcdce8c4SStefano Zampini if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2035fcdce8c4SStefano Zampini cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr); 2036fcdce8c4SStefano Zampini cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr); 2037fcdce8c4SStefano Zampini mmdata->mmBufferSize = mmBufferSize; 2038fcdce8c4SStefano Zampini } 2039afb2bd1cSJunchao Zhang mmdata->initialized = PETSC_TRUE; 2040afb2bd1cSJunchao Zhang } else { 2041afb2bd1cSJunchao Zhang /* to be safe, always update pointers of the mats */ 2042afb2bd1cSJunchao Zhang stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat); 2043afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat); 2044afb2bd1cSJunchao Zhang stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat); 2045afb2bd1cSJunchao Zhang } 2046afb2bd1cSJunchao Zhang 2047afb2bd1cSJunchao Zhang /* do cusparseSpMM, which supports transpose on B */ 2048afb2bd1cSJunchao Zhang stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one, 2049afb2bd1cSJunchao Zhang mat->matDescr,mmdata->matBDescr,mat->beta_zero, 2050afb2bd1cSJunchao Zhang mmdata->matCDescr,cusparse_scalartype, 2051fcdce8c4SStefano Zampini cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2052afb2bd1cSJunchao Zhang #else 
2053afb2bd1cSJunchao Zhang PetscInt k; 2054afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B */ 2055ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2056ccdfe979SStefano Zampini cublasHandle_t cublasv2handle; 2057ccdfe979SStefano Zampini cublasStatus_t cerr; 2058ccdfe979SStefano Zampini 2059ccdfe979SStefano Zampini ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 2060ccdfe979SStefano Zampini cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T, 2061ccdfe979SStefano Zampini B->cmap->n,B->rmap->n, 2062ccdfe979SStefano Zampini &PETSC_CUSPARSE_ONE ,barray,blda, 2063ccdfe979SStefano Zampini &PETSC_CUSPARSE_ZERO,barray,blda, 2064ccdfe979SStefano Zampini mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr); 2065ccdfe979SStefano Zampini blda = B->cmap->n; 2066afb2bd1cSJunchao Zhang k = B->cmap->n; 2067afb2bd1cSJunchao Zhang } else { 2068afb2bd1cSJunchao Zhang k = B->rmap->n; 2069ccdfe979SStefano Zampini } 2070ccdfe979SStefano Zampini 2071afb2bd1cSJunchao Zhang /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2072ccdfe979SStefano Zampini stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k, 2073afb2bd1cSJunchao Zhang csrmat->num_entries,mat->alpha_one,mat->descr, 2074ccdfe979SStefano Zampini csrmat->values->data().get(), 2075ccdfe979SStefano Zampini csrmat->row_offsets->data().get(), 2076ccdfe979SStefano Zampini csrmat->column_indices->data().get(), 2077ccdfe979SStefano Zampini mmdata->Bt ? 
mmdata->Bt : barray,blda,mat->beta_zero, 2078ccdfe979SStefano Zampini carray,clda);CHKERRCUSPARSE(stat); 2079afb2bd1cSJunchao Zhang #endif 2080afb2bd1cSJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2081c8378d12SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2082c8378d12SStefano Zampini ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr); 2083ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr); 2084ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { 2085ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2086ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2087ccdfe979SStefano Zampini } else if (product->type == MATPRODUCT_PtAP) { 2088ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr); 2089ccdfe979SStefano Zampini ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2090ccdfe979SStefano Zampini } else { 2091ccdfe979SStefano Zampini ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr); 2092ccdfe979SStefano Zampini } 2093ccdfe979SStefano Zampini if (mmdata->cisdense) { 2094ccdfe979SStefano Zampini ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr); 2095ccdfe979SStefano Zampini } 2096ccdfe979SStefano Zampini if (!biscuda) { 2097ccdfe979SStefano Zampini ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 2098ccdfe979SStefano Zampini } 2099ccdfe979SStefano Zampini PetscFunctionReturn(0); 2100ccdfe979SStefano Zampini } 2101ccdfe979SStefano Zampini 2102ccdfe979SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2103ccdfe979SStefano Zampini { 2104ccdfe979SStefano Zampini Mat_Product *product = C->product; 2105ccdfe979SStefano Zampini Mat A,B; 2106ccdfe979SStefano Zampini PetscInt 
m,n; 2107ccdfe979SStefano Zampini PetscBool cisdense,flg; 2108ccdfe979SStefano Zampini PetscErrorCode ierr; 2109ccdfe979SStefano Zampini MatMatCusparse *mmdata; 2110ccdfe979SStefano Zampini Mat_SeqAIJCUSPARSE *cusp; 2111ccdfe979SStefano Zampini 2112ccdfe979SStefano Zampini PetscFunctionBegin; 2113ccdfe979SStefano Zampini MatCheckProduct(C,1); 2114ccdfe979SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2115ccdfe979SStefano Zampini A = product->A; 2116ccdfe979SStefano Zampini B = product->B; 2117ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2118ccdfe979SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2119ccdfe979SStefano Zampini cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2120ccdfe979SStefano Zampini if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2121ccdfe979SStefano Zampini switch (product->type) { 2122ccdfe979SStefano Zampini case MATPRODUCT_AB: 2123ccdfe979SStefano Zampini m = A->rmap->n; 2124ccdfe979SStefano Zampini n = B->cmap->n; 2125ccdfe979SStefano Zampini break; 2126ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2127ccdfe979SStefano Zampini m = A->cmap->n; 2128ccdfe979SStefano Zampini n = B->cmap->n; 2129ccdfe979SStefano Zampini break; 2130ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2131ccdfe979SStefano Zampini m = A->rmap->n; 2132ccdfe979SStefano Zampini n = B->rmap->n; 2133ccdfe979SStefano Zampini break; 2134ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2135ccdfe979SStefano Zampini m = B->cmap->n; 2136ccdfe979SStefano Zampini n = B->cmap->n; 2137ccdfe979SStefano Zampini break; 2138ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2139ccdfe979SStefano Zampini m = B->rmap->n; 2140ccdfe979SStefano Zampini n = B->rmap->n; 2141ccdfe979SStefano Zampini break; 
2142ccdfe979SStefano Zampini default: 2143ccdfe979SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2144ccdfe979SStefano Zampini } 2145ccdfe979SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2146ccdfe979SStefano Zampini /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2147ccdfe979SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr); 2148ccdfe979SStefano Zampini ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr); 2149ccdfe979SStefano Zampini 2150ccdfe979SStefano Zampini /* product data */ 2151ccdfe979SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2152ccdfe979SStefano Zampini mmdata->cisdense = cisdense; 2153afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11,0,0) 2154afb2bd1cSJunchao Zhang /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2155ccdfe979SStefano Zampini if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2156afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr); 2157ccdfe979SStefano Zampini } 2158afb2bd1cSJunchao Zhang #endif 2159ccdfe979SStefano Zampini /* for these products we need intermediate storage */ 2160ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2161ccdfe979SStefano Zampini ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr); 2162ccdfe979SStefano Zampini ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr); 2163ccdfe979SStefano Zampini if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2164ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr); 
2165ccdfe979SStefano Zampini } else { 2166ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 2167ccdfe979SStefano Zampini } 2168ccdfe979SStefano Zampini } 2169ccdfe979SStefano Zampini C->product->data = mmdata; 2170ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 2171ccdfe979SStefano Zampini 2172ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2173ccdfe979SStefano Zampini PetscFunctionReturn(0); 2174ccdfe979SStefano Zampini } 2175ccdfe979SStefano Zampini 2176fcdce8c4SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2177ccdfe979SStefano Zampini { 2178ccdfe979SStefano Zampini Mat_Product *product = C->product; 2179fcdce8c4SStefano Zampini Mat A,B; 2180fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2181fcdce8c4SStefano Zampini Mat_SeqAIJ *c = (Mat_SeqAIJ*)C->data; 2182fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2183fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2184fcdce8c4SStefano Zampini PetscBool flg; 2185ccdfe979SStefano Zampini PetscErrorCode ierr; 2186fcdce8c4SStefano Zampini cusparseStatus_t stat; 2187fcdce8c4SStefano Zampini cudaError_t cerr; 2188fcdce8c4SStefano Zampini MatProductType ptype; 2189fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2190fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2191fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2192fcdce8c4SStefano Zampini #endif 2193ccdfe979SStefano Zampini 2194ccdfe979SStefano Zampini PetscFunctionBegin; 2195ccdfe979SStefano Zampini MatCheckProduct(C,1); 2196fcdce8c4SStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 2197fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2198fcdce8c4SStefano Zampini if (!flg) 
SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name); 2199fcdce8c4SStefano Zampini mmdata = (MatMatCusparse*)C->product->data; 2200fcdce8c4SStefano Zampini A = product->A; 2201fcdce8c4SStefano Zampini B = product->B; 2202fcdce8c4SStefano Zampini if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */ 2203fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_FALSE; 2204fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2205fcdce8c4SStefano Zampini if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2206fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2207fcdce8c4SStefano Zampini if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]); 2208fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2209fcdce8c4SStefano Zampini if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2210fcdce8c4SStefano Zampini goto finalize; 2211fcdce8c4SStefano Zampini } 2212fcdce8c4SStefano Zampini if (!c->nz) goto finalize; 2213fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2214fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2215fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2216fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2217fcdce8c4SStefano Zampini if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric 
phases"); 2218fcdce8c4SStefano Zampini if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2219fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2220fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2221fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2222fcdce8c4SStefano Zampini if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2223fcdce8c4SStefano Zampini if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2224fcdce8c4SStefano Zampini if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2225fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2226fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2227fcdce8c4SStefano Zampini 2228fcdce8c4SStefano Zampini ptype = product->type; 2229fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2230fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2231fcdce8c4SStefano Zampini switch (ptype) { 2232fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2233fcdce8c4SStefano Zampini Amat = Acusp->mat; 2234fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2235fcdce8c4SStefano Zampini break; 2236fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2237fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2238fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2239fcdce8c4SStefano Zampini break; 2240fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2241fcdce8c4SStefano Zampini Amat = Acusp->mat; 2242fcdce8c4SStefano Zampini Bmat = Bcusp->matTranspose; 2243fcdce8c4SStefano Zampini break; 2244fcdce8c4SStefano Zampini default: 
2245fcdce8c4SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2246fcdce8c4SStefano Zampini } 2247fcdce8c4SStefano Zampini Cmat = Ccusp->mat; 2248fcdce8c4SStefano Zampini if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2249fcdce8c4SStefano Zampini if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2250fcdce8c4SStefano Zampini if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]); 2251fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2252fcdce8c4SStefano Zampini Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */ 2253fcdce8c4SStefano Zampini Ccsr = (CsrMatrix*)Cmat->mat; 2254fcdce8c4SStefano Zampini if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2255fcdce8c4SStefano Zampini if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2256fcdce8c4SStefano Zampini if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct"); 2257fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2258fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2259fcdce8c4SStefano Zampini BmatSpDescr = mmdata->Bcsr ? 
mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */ 2260fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2261fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2262fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2263fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2264fcdce8c4SStefano Zampini stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2265fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2266fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2267fcdce8c4SStefano Zampini #else 2268fcdce8c4SStefano Zampini stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2269fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2270fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2271fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2272fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2273fcdce8c4SStefano Zampini #endif 2274fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2275fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2276fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2277fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2278fcdce8c4SStefano Zampini finalize: 2279fcdce8c4SStefano Zampini 
/* shorter version of MatAssemblyEnd_SeqAIJ */ 2280fcdce8c4SStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 2281fcdce8c4SStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 2282fcdce8c4SStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 2283fcdce8c4SStefano Zampini c->reallocs = 0; 2284fcdce8c4SStefano Zampini C->info.mallocs += 0; 2285fcdce8c4SStefano Zampini C->info.nz_unneeded = 0; 2286fcdce8c4SStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 2287fcdce8c4SStefano Zampini C->num_ass++; 2288fcdce8c4SStefano Zampini /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 2289fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(C);CHKERRQ(ierr); 2290ccdfe979SStefano Zampini PetscFunctionReturn(0); 2291ccdfe979SStefano Zampini } 2292fcdce8c4SStefano Zampini 2293fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2294fcdce8c4SStefano Zampini { 2295fcdce8c4SStefano Zampini Mat_Product *product = C->product; 2296fcdce8c4SStefano Zampini Mat A,B; 2297fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp,*Bcusp,*Ccusp; 2298fcdce8c4SStefano Zampini Mat_SeqAIJ *a,*b,*c; 2299fcdce8c4SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat; 2300fcdce8c4SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 2301fcdce8c4SStefano Zampini PetscInt i,j,m,n,k; 2302fcdce8c4SStefano Zampini PetscBool flg; 2303fcdce8c4SStefano Zampini PetscErrorCode ierr; 2304fcdce8c4SStefano Zampini cusparseStatus_t stat; 2305fcdce8c4SStefano Zampini cudaError_t cerr; 2306fcdce8c4SStefano Zampini MatProductType ptype; 2307fcdce8c4SStefano Zampini MatMatCusparse *mmdata; 2308fcdce8c4SStefano Zampini PetscLogDouble flops; 2309fcdce8c4SStefano Zampini PetscBool biscompressed,ciscompressed; 2310fcdce8c4SStefano Zampini #if 
PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2311fcdce8c4SStefano Zampini int64_t C_num_rows1, C_num_cols1, C_nnz1; 2312fcdce8c4SStefano Zampini size_t bufSize2; 2313fcdce8c4SStefano Zampini cusparseSpMatDescr_t BmatSpDescr; 2314fcdce8c4SStefano Zampini #else 2315fcdce8c4SStefano Zampini int cnz; 2316fcdce8c4SStefano Zampini #endif 2317fcdce8c4SStefano Zampini 2318fcdce8c4SStefano Zampini PetscFunctionBegin; 2319fcdce8c4SStefano Zampini MatCheckProduct(C,1); 2320fcdce8c4SStefano Zampini if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 2321fcdce8c4SStefano Zampini A = product->A; 2322fcdce8c4SStefano Zampini B = product->B; 2323fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2324fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name); 2325fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 2326fcdce8c4SStefano Zampini if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name); 2327fcdce8c4SStefano Zampini a = (Mat_SeqAIJ*)A->data; 2328fcdce8c4SStefano Zampini b = (Mat_SeqAIJ*)B->data; 2329fcdce8c4SStefano Zampini Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 2330fcdce8c4SStefano Zampini Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr; 2331fcdce8c4SStefano Zampini if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2332fcdce8c4SStefano Zampini if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format"); 2333fcdce8c4SStefano Zampini 2334fcdce8c4SStefano Zampini /* product data */ 2335fcdce8c4SStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 2336fcdce8c4SStefano Zampini C->product->data = mmdata; 2337fcdce8c4SStefano Zampini 
C->product->destroy = MatDestroy_MatMatCusparse; 2338fcdce8c4SStefano Zampini 2339fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2340fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 2341fcdce8c4SStefano Zampini ptype = product->type; 2342fcdce8c4SStefano Zampini if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 2343fcdce8c4SStefano Zampini if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB; 2344fcdce8c4SStefano Zampini biscompressed = PETSC_FALSE; 2345fcdce8c4SStefano Zampini ciscompressed = PETSC_FALSE; 2346fcdce8c4SStefano Zampini switch (ptype) { 2347fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2348fcdce8c4SStefano Zampini m = A->rmap->n; 2349fcdce8c4SStefano Zampini n = B->cmap->n; 2350fcdce8c4SStefano Zampini k = A->cmap->n; 2351fcdce8c4SStefano Zampini Amat = Acusp->mat; 2352fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2353fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2354fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2355fcdce8c4SStefano Zampini break; 2356fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 2357fcdce8c4SStefano Zampini m = A->cmap->n; 2358fcdce8c4SStefano Zampini n = B->cmap->n; 2359fcdce8c4SStefano Zampini k = A->rmap->n; 2360fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 2361fcdce8c4SStefano Zampini Amat = Acusp->matTranspose; 2362fcdce8c4SStefano Zampini Bmat = Bcusp->mat; 2363fcdce8c4SStefano Zampini if (b->compressedrow.use) biscompressed = PETSC_TRUE; 2364fcdce8c4SStefano Zampini break; 2365fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2366fcdce8c4SStefano Zampini m = A->rmap->n; 2367fcdce8c4SStefano Zampini n = B->rmap->n; 2368fcdce8c4SStefano Zampini k = A->cmap->n; 2369fcdce8c4SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 2370fcdce8c4SStefano Zampini Amat = Acusp->mat; 2371fcdce8c4SStefano Zampini Bmat = 
Bcusp->matTranspose; 2372fcdce8c4SStefano Zampini if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 2373fcdce8c4SStefano Zampini break; 2374fcdce8c4SStefano Zampini default: 2375fcdce8c4SStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 2376fcdce8c4SStefano Zampini } 2377fcdce8c4SStefano Zampini 2378fcdce8c4SStefano Zampini /* create cusparse matrix */ 2379fcdce8c4SStefano Zampini ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr); 2380fcdce8c4SStefano Zampini ierr = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 2381fcdce8c4SStefano Zampini c = (Mat_SeqAIJ*)C->data; 2382fcdce8c4SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr; 2383fcdce8c4SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 2384fcdce8c4SStefano Zampini Ccsr = new CsrMatrix; 2385fcdce8c4SStefano Zampini 2386fcdce8c4SStefano Zampini c->compressedrow.use = ciscompressed; 2387fcdce8c4SStefano Zampini if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 2388fcdce8c4SStefano Zampini c->compressedrow.nrows = a->compressedrow.nrows; 2389fcdce8c4SStefano Zampini ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr); 2390fcdce8c4SStefano Zampini ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr); 2391fcdce8c4SStefano Zampini Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 2392fcdce8c4SStefano Zampini Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 2393fcdce8c4SStefano Zampini Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows); 2394fcdce8c4SStefano Zampini } else { 2395fcdce8c4SStefano Zampini c->compressedrow.nrows = 0; 2396fcdce8c4SStefano Zampini c->compressedrow.i = NULL; 2397fcdce8c4SStefano Zampini c->compressedrow.rindex = NULL; 2398fcdce8c4SStefano Zampini 
Ccusp->workVector = NULL; 2399fcdce8c4SStefano Zampini Cmat->cprowIndices = NULL; 2400fcdce8c4SStefano Zampini } 2401fcdce8c4SStefano Zampini Ccusp->nrows = ciscompressed ? c->compressedrow.nrows : m; 2402fcdce8c4SStefano Zampini Ccusp->mat = Cmat; 2403fcdce8c4SStefano Zampini Ccusp->mat->mat = Ccsr; 2404fcdce8c4SStefano Zampini Ccsr->num_rows = Ccusp->nrows; 2405fcdce8c4SStefano Zampini Ccsr->num_cols = n; 2406fcdce8c4SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1); 2407fcdce8c4SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 2408fcdce8c4SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 2409fcdce8c4SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 2410fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 2411fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 2412fcdce8c4SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 2413fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2414fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2415fcdce8c4SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 2416fcdce8c4SStefano Zampini if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 2417fcdce8c4SStefano Zampini thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0); 2418fcdce8c4SStefano Zampini c->nz = 0; 2419fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2420fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2421fcdce8c4SStefano Zampini goto finalizesym; 2422fcdce8c4SStefano Zampini } 2423fcdce8c4SStefano Zampini 2424fcdce8c4SStefano Zampini if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]); 2425fcdce8c4SStefano Zampini if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]); 2426fcdce8c4SStefano Zampini Acsr = (CsrMatrix*)Amat->mat; 2427fcdce8c4SStefano Zampini if (!biscompressed) { 2428fcdce8c4SStefano Zampini Bcsr = (CsrMatrix*)Bmat->mat; 2429fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2430fcdce8c4SStefano Zampini BmatSpDescr = Bmat->matDescr; 2431fcdce8c4SStefano Zampini #endif 2432fcdce8c4SStefano Zampini } else { /* we need to use row offsets for the full matrix */ 2433fcdce8c4SStefano Zampini CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat; 2434fcdce8c4SStefano Zampini Bcsr = new CsrMatrix; 2435fcdce8c4SStefano Zampini Bcsr->num_rows = B->rmap->n; 2436fcdce8c4SStefano Zampini Bcsr->num_cols = cBcsr->num_cols; 2437fcdce8c4SStefano Zampini Bcsr->num_entries = cBcsr->num_entries; 2438fcdce8c4SStefano Zampini Bcsr->column_indices = cBcsr->column_indices; 2439fcdce8c4SStefano Zampini Bcsr->values = cBcsr->values; 2440fcdce8c4SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 2441fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 2442fcdce8c4SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 2443fcdce8c4SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 2444fcdce8c4SStefano Zampini } 
2445fcdce8c4SStefano Zampini Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 2446fcdce8c4SStefano Zampini mmdata->Bcsr = Bcsr; 2447fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2448fcdce8c4SStefano Zampini if (Bcsr->num_rows && Bcsr->num_cols) { 2449fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, 2450fcdce8c4SStefano Zampini Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2451fcdce8c4SStefano Zampini Bcsr->values->data().get(), 2452fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2453fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2454fcdce8c4SStefano Zampini } 2455fcdce8c4SStefano Zampini BmatSpDescr = mmdata->matSpBDescr; 2456fcdce8c4SStefano Zampini #endif 2457fcdce8c4SStefano Zampini } 2458fcdce8c4SStefano Zampini if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct"); 2459fcdce8c4SStefano Zampini if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct"); 2460fcdce8c4SStefano Zampini /* precompute flops count */ 2461fcdce8c4SStefano Zampini if (ptype == MATPRODUCT_AB) { 2462fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2463fcdce8c4SStefano Zampini const PetscInt st = a->i[i]; 2464fcdce8c4SStefano Zampini const PetscInt en = a->i[i+1]; 2465fcdce8c4SStefano Zampini for (j=st; j<en; j++) { 2466fcdce8c4SStefano Zampini const PetscInt brow = a->j[j]; 2467fcdce8c4SStefano Zampini flops += 2.*(b->i[brow+1] - b->i[brow]); 2468fcdce8c4SStefano Zampini } 2469fcdce8c4SStefano Zampini } 2470fcdce8c4SStefano Zampini } else if (ptype == MATPRODUCT_AtB) { 2471fcdce8c4SStefano Zampini for (i=0, flops = 0; i<A->rmap->n; i++) { 2472fcdce8c4SStefano Zampini const PetscInt anzi = a->i[i+1] - a->i[i]; 2473fcdce8c4SStefano Zampini const PetscInt bnzi = b->i[i+1] - b->i[i]; 2474fcdce8c4SStefano Zampini flops += (2.*anzi)*bnzi; 
2475fcdce8c4SStefano Zampini } 2476fcdce8c4SStefano Zampini } else { /* TODO */ 2477fcdce8c4SStefano Zampini flops = 0.; 2478fcdce8c4SStefano Zampini } 2479fcdce8c4SStefano Zampini 2480fcdce8c4SStefano Zampini mmdata->flops = flops; 2481fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2482fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2483fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2484fcdce8c4SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, 2485fcdce8c4SStefano Zampini NULL, NULL, NULL, 2486fcdce8c4SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 2487fcdce8c4SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 2488fcdce8c4SStefano Zampini stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2489fcdce8c4SStefano Zampini /* ask bufferSize bytes for external memory */ 2490fcdce8c4SStefano Zampini stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2491fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2492fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2493fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat); 2494fcdce8c4SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUSPARSE(stat); 2495fcdce8c4SStefano Zampini /* inspect the matrices A and B to understand the memory requirement for the next step */ 2496fcdce8c4SStefano Zampini stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2497fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2498fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
2499fcdce8c4SStefano Zampini mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat); 2500fcdce8c4SStefano Zampini /* ask bufferSize again bytes for external memory */ 2501fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2502fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2503fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2504fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat); 2505fcdce8c4SStefano Zampini /* The CUSPARSE documentation is not clear, nor the API 2506fcdce8c4SStefano Zampini We need both buffers to perform the operations properly! 2507fcdce8c4SStefano Zampini mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 2508fcdce8c4SStefano Zampini it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 2509fcdce8c4SStefano Zampini is stored in the descriptor! What a messy API... 
*/ 2510fcdce8c4SStefano Zampini cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUSPARSE(stat); 2511fcdce8c4SStefano Zampini /* compute the intermediate product of A * B */ 2512fcdce8c4SStefano Zampini stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2513fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2514fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 2515fcdce8c4SStefano Zampini mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat); 2516fcdce8c4SStefano Zampini /* get matrix C non-zero entries C_nnz1 */ 2517fcdce8c4SStefano Zampini stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat); 2518fcdce8c4SStefano Zampini c->nz = (PetscInt) C_nnz1; 2519fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2520fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2521fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2522fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2523fcdce8c4SStefano Zampini stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), 2524fcdce8c4SStefano Zampini Ccsr->values->data().get());CHKERRCUSPARSE(stat); 2525fcdce8c4SStefano Zampini stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2526fcdce8c4SStefano Zampini Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, 2527fcdce8c4SStefano Zampini cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat); 2528fcdce8c4SStefano Zampini #else 2529fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat); 
2530fcdce8c4SStefano Zampini stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2531fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2532fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2533fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2534fcdce8c4SStefano Zampini Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat); 2535fcdce8c4SStefano Zampini c->nz = cnz; 2536fcdce8c4SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 2537fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2538fcdce8c4SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 2539fcdce8c4SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 2540fcdce8c4SStefano Zampini 2541fcdce8c4SStefano Zampini stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 2542fcdce8c4SStefano Zampini /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 2543fcdce8c4SStefano Zampini I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 2544fcdce8c4SStefano Zampini D is NULL, despite the fact that CUSPARSE documentation claims it is supported! 
*/ 2545fcdce8c4SStefano Zampini stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, 2546fcdce8c4SStefano Zampini Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, 2547fcdce8c4SStefano Zampini Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), 2548fcdce8c4SStefano Zampini Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), 2549fcdce8c4SStefano Zampini Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat); 2550fcdce8c4SStefano Zampini #endif 2551fcdce8c4SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 2552fcdce8c4SStefano Zampini ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr); 2553fcdce8c4SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2554fcdce8c4SStefano Zampini finalizesym: 2555fcdce8c4SStefano Zampini c->singlemalloc = PETSC_FALSE; 2556fcdce8c4SStefano Zampini c->free_a = PETSC_TRUE; 2557fcdce8c4SStefano Zampini c->free_ij = PETSC_TRUE; 2558fcdce8c4SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 2559fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 2560fcdce8c4SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 2561fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2562fcdce8c4SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 2563fcdce8c4SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 2564fcdce8c4SStefano Zampini ii = *Ccsr->row_offsets; 2565fcdce8c4SStefano Zampini jj = *Ccsr->column_indices; 2566fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2567fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 
2568fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2569fcdce8c4SStefano Zampini } else { 2570fcdce8c4SStefano Zampini PetscInt *d_i = c->i; 2571fcdce8c4SStefano Zampini if (ciscompressed) d_i = c->compressedrow.i; 2572fcdce8c4SStefano Zampini cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2573fcdce8c4SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 2574fcdce8c4SStefano Zampini } 2575fcdce8c4SStefano Zampini if (ciscompressed) { /* need to expand host row offsets */ 2576fcdce8c4SStefano Zampini PetscInt r = 0; 2577fcdce8c4SStefano Zampini c->i[0] = 0; 2578fcdce8c4SStefano Zampini for (k = 0; k < c->compressedrow.nrows; k++) { 2579fcdce8c4SStefano Zampini const PetscInt next = c->compressedrow.rindex[k]; 2580fcdce8c4SStefano Zampini const PetscInt old = c->compressedrow.i[k]; 2581fcdce8c4SStefano Zampini for (; r < next; r++) c->i[r+1] = old; 2582fcdce8c4SStefano Zampini } 2583fcdce8c4SStefano Zampini for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows]; 2584fcdce8c4SStefano Zampini } 2585fcdce8c4SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 2586fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 2587fcdce8c4SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 2588fcdce8c4SStefano Zampini c->maxnz = c->nz; 2589fcdce8c4SStefano Zampini c->nonzerorowcnt = 0; 2590fcdce8c4SStefano Zampini c->rmax = 0; 2591fcdce8c4SStefano Zampini for (k = 0; k < m; k++) { 2592fcdce8c4SStefano Zampini const PetscInt nn = c->i[k+1] - c->i[k]; 2593fcdce8c4SStefano Zampini c->ilen[k] = c->imax[k] = nn; 2594fcdce8c4SStefano Zampini c->nonzerorowcnt += 
(PetscInt)!!nn; 2595fcdce8c4SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 2596fcdce8c4SStefano Zampini } 2597fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr); 2598fcdce8c4SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 2599fcdce8c4SStefano Zampini Ccsr->num_entries = c->nz; 2600fcdce8c4SStefano Zampini 2601fcdce8c4SStefano Zampini C->nonzerostate++; 2602fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 2603fcdce8c4SStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 2604fcdce8c4SStefano Zampini Ccusp->nonzerostate = C->nonzerostate; 2605fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 2606fcdce8c4SStefano Zampini C->preallocated = PETSC_TRUE; 2607fcdce8c4SStefano Zampini C->assembled = PETSC_FALSE; 2608fcdce8c4SStefano Zampini C->was_assembled = PETSC_FALSE; 2609fcdce8c4SStefano Zampini if (product->api_user) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 2610fcdce8c4SStefano Zampini mmdata->reusesym = PETSC_TRUE; 2611fcdce8c4SStefano Zampini C->offloadmask = PETSC_OFFLOAD_GPU; 2612fcdce8c4SStefano Zampini } 2613fcdce8c4SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2614fcdce8c4SStefano Zampini PetscFunctionReturn(0); 2615fcdce8c4SStefano Zampini } 2616fcdce8c4SStefano Zampini 2617fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2618fcdce8c4SStefano Zampini 2619fcdce8c4SStefano Zampini /* handles sparse or dense B */ 2620fcdce8c4SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 2621fcdce8c4SStefano Zampini { 2622fcdce8c4SStefano Zampini Mat_Product *product = mat->product; 2623fcdce8c4SStefano Zampini PetscErrorCode ierr; 2624fcdce8c4SStefano Zampini PetscBool isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE; 2625fcdce8c4SStefano Zampini 2626fcdce8c4SStefano Zampini 
PetscFunctionBegin; 2627fcdce8c4SStefano Zampini MatCheckProduct(mat,1); 2628fcdce8c4SStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 2629fcdce8c4SStefano Zampini if (!product->B->boundtocpu) { 2630fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 2631fcdce8c4SStefano Zampini } 2632fcdce8c4SStefano Zampini if (product->type == MATPRODUCT_ABC) { 2633fcdce8c4SStefano Zampini Ciscusp = PETSC_FALSE; 2634fcdce8c4SStefano Zampini if (!product->C->boundtocpu) { 2635fcdce8c4SStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr); 2636fcdce8c4SStefano Zampini } 2637fcdce8c4SStefano Zampini } 2638fcdce8c4SStefano Zampini if (isdense) { 2639ccdfe979SStefano Zampini switch (product->type) { 2640ccdfe979SStefano Zampini case MATPRODUCT_AB: 2641ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2642ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2643ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2644ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2645fcdce8c4SStefano Zampini if (product->A->boundtocpu) { 2646fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr); 2647fcdce8c4SStefano Zampini } else { 2648fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2649fcdce8c4SStefano Zampini } 2650fcdce8c4SStefano Zampini break; 2651fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2652fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2653fcdce8c4SStefano Zampini break; 2654ccdfe979SStefano Zampini default: 2655ccdfe979SStefano Zampini break; 2656ccdfe979SStefano Zampini } 2657fcdce8c4SStefano Zampini } else if (Biscusp && Ciscusp) { 2658fcdce8c4SStefano Zampini switch (product->type) { 2659fcdce8c4SStefano Zampini case MATPRODUCT_AB: 2660fcdce8c4SStefano Zampini case MATPRODUCT_AtB: 
2661fcdce8c4SStefano Zampini case MATPRODUCT_ABt: 2662fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 2663fcdce8c4SStefano Zampini break; 2664fcdce8c4SStefano Zampini case MATPRODUCT_PtAP: 2665fcdce8c4SStefano Zampini case MATPRODUCT_RARt: 2666fcdce8c4SStefano Zampini case MATPRODUCT_ABC: 2667fcdce8c4SStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 2668fcdce8c4SStefano Zampini break; 2669fcdce8c4SStefano Zampini default: 2670fcdce8c4SStefano Zampini break; 2671fcdce8c4SStefano Zampini } 2672fcdce8c4SStefano Zampini } else { /* fallback for AIJ */ 2673fcdce8c4SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 2674fcdce8c4SStefano Zampini } 2675ccdfe979SStefano Zampini PetscFunctionReturn(0); 2676ccdfe979SStefano Zampini } 2677ccdfe979SStefano Zampini 26786fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 26799ae82921SPaul Mullowney { 2680b175d8bbSPaul Mullowney PetscErrorCode ierr; 26819ae82921SPaul Mullowney 26829ae82921SPaul Mullowney PetscFunctionBegin; 2683e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2684e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2685e6e9a74fSStefano Zampini } 2686e6e9a74fSStefano Zampini 2687e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2688e6e9a74fSStefano Zampini { 2689e6e9a74fSStefano Zampini PetscErrorCode ierr; 2690e6e9a74fSStefano Zampini 2691e6e9a74fSStefano Zampini PetscFunctionBegin; 2692e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2693e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2694e6e9a74fSStefano Zampini } 2695e6e9a74fSStefano Zampini 2696e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2697e6e9a74fSStefano Zampini { 
2698e6e9a74fSStefano Zampini PetscErrorCode ierr; 2699e6e9a74fSStefano Zampini 2700e6e9a74fSStefano Zampini PetscFunctionBegin; 2701e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2702e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2703e6e9a74fSStefano Zampini } 2704e6e9a74fSStefano Zampini 2705e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2706e6e9a74fSStefano Zampini { 2707e6e9a74fSStefano Zampini PetscErrorCode ierr; 2708e6e9a74fSStefano Zampini 2709e6e9a74fSStefano Zampini PetscFunctionBegin; 2710e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 27119ae82921SPaul Mullowney PetscFunctionReturn(0); 27129ae82921SPaul Mullowney } 27139ae82921SPaul Mullowney 27146fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2715ca45077fSPaul Mullowney { 2716b175d8bbSPaul Mullowney PetscErrorCode ierr; 2717ca45077fSPaul Mullowney 2718ca45077fSPaul Mullowney PetscFunctionBegin; 2719e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2720ca45077fSPaul Mullowney PetscFunctionReturn(0); 2721ca45077fSPaul Mullowney } 2722ca45077fSPaul Mullowney 2723afb2bd1cSJunchao Zhang /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2724e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 27259ae82921SPaul Mullowney { 27269ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2727aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 27289ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2729e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2730b175d8bbSPaul Mullowney PetscErrorCode ierr; 273157d48284SJunchao Zhang cudaError_t cerr; 2732aa372e3fSPaul Mullowney cusparseStatus_t stat; 2733e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2734e6e9a74fSStefano Zampini PetscBool compressed; 2735afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2736afb2bd1cSJunchao Zhang PetscInt nx,ny; 2737afb2bd1cSJunchao Zhang #endif 27386e111a19SKarl Rupp 27399ae82921SPaul Mullowney PetscFunctionBegin; 2740e6e9a74fSStefano Zampini if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2741e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2742afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2743d38a13f6SStefano Zampini else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);} 2744e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2745e6e9a74fSStefano Zampini } 274634d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 274734d6c7a5SJose E. 
Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2748e6e9a74fSStefano Zampini if (!trans) { 27499ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2750c9567895SMark if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 2751e6e9a74fSStefano Zampini } else { 2752e6e9a74fSStefano Zampini if (herm || !cusparsestruct->transgen) { 2753e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2754e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2755e6e9a74fSStefano Zampini } else { 2756afb2bd1cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2757e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2758e6e9a74fSStefano Zampini } 2759e6e9a74fSStefano Zampini } 2760e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2761e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2762213423ffSJunchao Zhang 2763e6e9a74fSStefano Zampini try { 2764e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2765213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2766213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2767afb2bd1cSJunchao Zhang 276885ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2769e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2770afb2bd1cSJunchao Zhang /* z = A x + beta y. 2771afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 
2772afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2773afb2bd1cSJunchao Zhang */ 2774e6e9a74fSStefano Zampini xptr = xarray; 2775afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2776213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2777afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2778afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2779afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2780afb2bd1cSJunchao Zhang */ 2781afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2782afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2783afb2bd1cSJunchao Zhang nx = mat->num_cols; 2784afb2bd1cSJunchao Zhang ny = mat->num_rows; 2785afb2bd1cSJunchao Zhang } 2786afb2bd1cSJunchao Zhang #endif 2787e6e9a74fSStefano Zampini } else { 2788afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2789afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2790afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2791afb2bd1cSJunchao Zhang */ 2792afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2793e6e9a74fSStefano Zampini dptr = zarray; 2794e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2795afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2796e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2797e6e9a74fSStefano Zampini thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2798e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2799e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2800e6e9a74fSStefano Zampini } 2801afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2802afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2803afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2804afb2bd1cSJunchao Zhang nx = mat->num_rows; 2805afb2bd1cSJunchao Zhang ny = mat->num_cols; 2806afb2bd1cSJunchao Zhang } 2807afb2bd1cSJunchao Zhang #endif 2808e6e9a74fSStefano Zampini } 28099ae82921SPaul Mullowney 2810afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2811aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2812afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2813afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2814afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2815afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2816afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2817afb2bd1cSJunchao Zhang stat = 
cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2818afb2bd1cSJunchao Zhang matstruct->matDescr, 2819afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2820afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2821afb2bd1cSJunchao Zhang cusparse_scalartype, 2822afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2823afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2824afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2825afb2bd1cSJunchao Zhang 2826afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2827afb2bd1cSJunchao Zhang } else { 2828afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2829afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2830afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2831afb2bd1cSJunchao Zhang } 2832afb2bd1cSJunchao Zhang 2833afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2834afb2bd1cSJunchao Zhang matstruct->alpha_one, 2835afb2bd1cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2836afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2837afb2bd1cSJunchao Zhang beta, 2838afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2839afb2bd1cSJunchao Zhang cusparse_scalartype, 2840afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2841afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2842afb2bd1cSJunchao Zhang #else 28437656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2844e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2845a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 
2846afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2847aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2848e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 284957d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2850afb2bd1cSJunchao Zhang #endif 2851aa372e3fSPaul Mullowney } else { 2852213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2853afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2854afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2855afb2bd1cSJunchao Zhang #else 2856301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2857e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2858afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2859e6e9a74fSStefano Zampini xptr, beta, 286057d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2861afb2bd1cSJunchao Zhang #endif 2862a65300a6SPaul Mullowney } 2863aa372e3fSPaul Mullowney } 286405035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2865958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2866aa372e3fSPaul Mullowney 2867e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2868213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2869213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2870213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2871e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2872213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 28737656d835SStefano Zampini } 2874213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2875c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 28767656d835SStefano Zampini } 28777656d835SStefano Zampini 2878213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2879213423ffSJunchao Zhang if (compressed) { 2880213423ffSJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2881e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2882c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2883e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2884c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 288505035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2886958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2887e6e9a74fSStefano Zampini } 2888e6e9a74fSStefano Zampini } else { 2889e6e9a74fSStefano Zampini if (yy && yy != zz) { 2890e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2891e6e9a74fSStefano Zampini } 2892e6e9a74fSStefano Zampini } 2893e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2894213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2895213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 28969ae82921SPaul Mullowney } catch(char *ex) { 28979ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 28989ae82921SPaul Mullowney } 2899e6e9a74fSStefano Zampini if (yy) { 2900958c4211Shannah_mairs ierr = 
PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2901e6e9a74fSStefano Zampini } else { 2902e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2903e6e9a74fSStefano Zampini } 29049ae82921SPaul Mullowney PetscFunctionReturn(0); 29059ae82921SPaul Mullowney } 29069ae82921SPaul Mullowney 29076fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2908ca45077fSPaul Mullowney { 2909b175d8bbSPaul Mullowney PetscErrorCode ierr; 29106e111a19SKarl Rupp 2911ca45077fSPaul Mullowney PetscFunctionBegin; 2912e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2913ca45077fSPaul Mullowney PetscFunctionReturn(0); 2914ca45077fSPaul Mullowney } 2915ca45077fSPaul Mullowney 29166fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 29179ae82921SPaul Mullowney { 29189ae82921SPaul Mullowney PetscErrorCode ierr; 2919a587d139SMark PetscSplitCSRDataStructure *d_mat = NULL; 29209ae82921SPaul Mullowney PetscFunctionBegin; 2921bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 29223fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2923bc3f50f2SPaul Mullowney } 29243fa6b06aSMark Adams ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
29253fa6b06aSMark Adams if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2926a587d139SMark if (d_mat) { 29273fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; 29283fa6b06aSMark Adams } 29293fa6b06aSMark Adams 29309ae82921SPaul Mullowney PetscFunctionReturn(0); 29319ae82921SPaul Mullowney } 29329ae82921SPaul Mullowney 29339ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 2934e057df02SPaul Mullowney /*@ 29359ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2936e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2937e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2938e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 2939e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 2940e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 29419ae82921SPaul Mullowney 2942d083f849SBarry Smith Collective 29439ae82921SPaul Mullowney 29449ae82921SPaul Mullowney Input Parameters: 29459ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 29469ae82921SPaul Mullowney . m - number of rows 29479ae82921SPaul Mullowney . n - number of columns 29489ae82921SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 29499ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 29500298fd71SBarry Smith (possibly different for each row) or NULL 29519ae82921SPaul Mullowney 29529ae82921SPaul Mullowney Output Parameter: 29539ae82921SPaul Mullowney . 
A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
29769ae82921SPaul Mullowney 29779ae82921SPaul Mullowney Level: intermediate 29789ae82921SPaul Mullowney 2979e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 29809ae82921SPaul Mullowney @*/ 29819ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 29829ae82921SPaul Mullowney { 29839ae82921SPaul Mullowney PetscErrorCode ierr; 29849ae82921SPaul Mullowney 29859ae82921SPaul Mullowney PetscFunctionBegin; 29869ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 29879ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 29889ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 29899ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 29909ae82921SPaul Mullowney PetscFunctionReturn(0); 29919ae82921SPaul Mullowney } 29929ae82921SPaul Mullowney 29936fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 29949ae82921SPaul Mullowney { 29959ae82921SPaul Mullowney PetscErrorCode ierr; 29963fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 2997ab25e6cbSDominic Meiser 29989ae82921SPaul Mullowney PetscFunctionBegin; 29999ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 30003fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 30013fa6b06aSMark Adams ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 3002470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 30039ae82921SPaul Mullowney } else { 3004470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr); 3005aa372e3fSPaul Mullowney } 30063fa6b06aSMark Adams if (d_mat) { 30073fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 
30083fa6b06aSMark Adams cudaError_t err; 30093fa6b06aSMark Adams PetscSplitCSRDataStructure h_mat; 30103fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 30113fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 30123fa6b06aSMark Adams if (a->compressedrow.use) { 30133fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 30143fa6b06aSMark Adams } 30153fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 30163fa6b06aSMark Adams } 3017ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 3018ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3019ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3020fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3021ccdfe979SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 30227e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 30237e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 30249ae82921SPaul Mullowney ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 30259ae82921SPaul Mullowney PetscFunctionReturn(0); 30269ae82921SPaul Mullowney } 30279ae82921SPaul Mullowney 3028ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*); 302995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool); 30309ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat 
A,MatDuplicateOption cpvalues,Mat *B) 30319ff858a8SKarl Rupp { 30329ff858a8SKarl Rupp PetscErrorCode ierr; 30339ff858a8SKarl Rupp 30349ff858a8SKarl Rupp PetscFunctionBegin; 30359ff858a8SKarl Rupp ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr); 3036ccdfe979SStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr); 30379ff858a8SKarl Rupp PetscFunctionReturn(0); 30389ff858a8SKarl Rupp } 30399ff858a8SKarl Rupp 3040a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str) // put axpy in aijcusparse, etc. 304195639643SRichard Tran Mills { 3042e6e9a74fSStefano Zampini PetscErrorCode ierr; 3043a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 3044a587d139SMark PetscBool flgx,flgy; 3045e6e9a74fSStefano Zampini 304695639643SRichard Tran Mills PetscFunctionBegin; 3047a587d139SMark if (a == (PetscScalar)0.0) PetscFunctionReturn(0); 3048a587d139SMark PetscValidHeaderSpecific(Y,MAT_CLASSID,1); 3049a587d139SMark PetscValidHeaderSpecific(X,MAT_CLASSID,3); 3050a587d139SMark ierr = PetscObjectTypeCompare((PetscObject)Y,MATSEQAIJCUSPARSE,&flgy);CHKERRQ(ierr); 3051a587d139SMark ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQAIJCUSPARSE,&flgx);CHKERRQ(ierr); 3052a587d139SMark if (!flgx || !flgy) { 3053a587d139SMark ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3054a587d139SMark PetscFunctionReturn(0); 305595639643SRichard Tran Mills } 3056a587d139SMark if (Y->factortype != MAT_FACTOR_NONE || X->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"both matrices must be MAT_FACTOR_NONE"); 3057a587d139SMark if (str == DIFFERENT_NONZERO_PATTERN) { 3058a587d139SMark if (x->nz == y->nz) { 3059a587d139SMark PetscBool e; 3060a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 3061a587d139SMark if (e) { 3062a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 
3063a587d139SMark if (e) { 3064a587d139SMark str = SAME_NONZERO_PATTERN; 3065a587d139SMark } 3066a587d139SMark } 3067a587d139SMark } 3068a587d139SMark } 3069a587d139SMark if (str != SAME_NONZERO_PATTERN) { 3070a587d139SMark ierr = MatAXPY_SeqAIJ( Y, a, X, str);CHKERRQ(ierr); 3071a587d139SMark PetscFunctionReturn(0); 3072a587d139SMark } else { 3073a587d139SMark Mat_SeqAIJCUSPARSE *cusparsestruct_y = (Mat_SeqAIJCUSPARSE*)Y->spptr; 3074a587d139SMark Mat_SeqAIJCUSPARSE *cusparsestruct_x = (Mat_SeqAIJCUSPARSE*)X->spptr; 3075a587d139SMark if (cusparsestruct_y->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3076a587d139SMark if (cusparsestruct_x->format!=MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported"); 3077a587d139SMark if (!cusparsestruct_y->mat || !cusparsestruct_x->mat) { 3078a587d139SMark if (Y->offloadmask == PETSC_OFFLOAD_UNALLOCATED || Y->offloadmask == PETSC_OFFLOAD_GPU) { 3079a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3080a587d139SMark } 3081a587d139SMark if (X->offloadmask == PETSC_OFFLOAD_UNALLOCATED || X->offloadmask == PETSC_OFFLOAD_GPU) { 3082a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(X);CHKERRQ(ierr); 3083a587d139SMark } 3084a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 3085a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr); 3086a587d139SMark ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr); 3087a587d139SMark } else { 3088a587d139SMark cublasHandle_t cublasv2handle; 3089a587d139SMark cublasStatus_t cberr; 3090a587d139SMark cudaError_t err; 3091a587d139SMark PetscScalar alpha = a; 3092a587d139SMark PetscBLASInt one = 1, bnz = 1; 3093a587d139SMark CsrMatrix *matrix_y = (CsrMatrix*)cusparsestruct_y->mat->mat; 3094a587d139SMark CsrMatrix *matrix_x = (CsrMatrix*)cusparsestruct_x->mat->mat; 3095a587d139SMark PetscScalar *aa_y, *aa_x; 3096a587d139SMark aa_y 
= matrix_y->values->data().get(); 3097a587d139SMark aa_x = matrix_x->values->data().get(); 3098a587d139SMark ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr); 3099a587d139SMark ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr); 3100a587d139SMark ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3101a587d139SMark cberr = cublasXaxpy(cublasv2handle,bnz,&alpha,aa_x,one,aa_y,one);CHKERRCUBLAS(cberr); 3102a587d139SMark err = WaitForCUDA();CHKERRCUDA(err); 3103a587d139SMark ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr); 3104a587d139SMark ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3105a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr); 3106a587d139SMark ierr = PetscObjectStateIncrease((PetscObject)Y);CHKERRQ(ierr); 3107a587d139SMark if (Y->offloadmask == PETSC_OFFLOAD_BOTH) Y->offloadmask = PETSC_OFFLOAD_GPU; 3108a587d139SMark else if (Y->offloadmask != PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"wrong state"); 3109a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(Y);CHKERRQ(ierr); 3110a587d139SMark } 3111a587d139SMark } 311295639643SRichard Tran Mills PetscFunctionReturn(0); 311395639643SRichard Tran Mills } 311495639643SRichard Tran Mills 31153fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A) 31163fa6b06aSMark Adams { 31173fa6b06aSMark Adams PetscErrorCode ierr; 31187e8381f9SStefano Zampini PetscBool both = PETSC_FALSE; 3119a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 31207e8381f9SStefano Zampini 31213fa6b06aSMark Adams PetscFunctionBegin; 31223fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 31233fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr; 31247e8381f9SStefano Zampini if (spptr->mat) { 31257e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat; 31267e8381f9SStefano Zampini if (matrix->values) { 31277e8381f9SStefano Zampini both = PETSC_TRUE; 31287e8381f9SStefano Zampini 
thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 31297e8381f9SStefano Zampini } 31307e8381f9SStefano Zampini } 31317e8381f9SStefano Zampini if (spptr->matTranspose) { 31327e8381f9SStefano Zampini CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat; 31337e8381f9SStefano Zampini if (matrix->values) { 31347e8381f9SStefano Zampini thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 31357e8381f9SStefano Zampini } 31367e8381f9SStefano Zampini } 31373fa6b06aSMark Adams } 3138a587d139SMark //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 3139a587d139SMark ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr); 3140a587d139SMark ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 31417e8381f9SStefano Zampini if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 3142a587d139SMark else A->offloadmask = PETSC_OFFLOAD_CPU; 31433fa6b06aSMark Adams 31443fa6b06aSMark Adams PetscFunctionReturn(0); 31453fa6b06aSMark Adams } 31463fa6b06aSMark Adams 3147a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg) 3148a587d139SMark { 3149a587d139SMark Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3150a587d139SMark PetscErrorCode ierr; 3151a587d139SMark 3152a587d139SMark PetscFunctionBegin; 3153a587d139SMark if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0); 3154a587d139SMark if (flg) { 3155a587d139SMark ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 3156a587d139SMark 3157a587d139SMark A->ops->axpy = MatAXPY_SeqAIJ; 3158a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJ; 3159a587d139SMark A->ops->mult = MatMult_SeqAIJ; 3160a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJ; 3161a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJ; 3162a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJ; 3163a587d139SMark A->ops->multhermitiantranspose = NULL; 3164a587d139SMark A->ops->multhermitiantransposeadd = NULL; 3165fcdce8c4SStefano Zampini A->ops->productsetfromoptions = 
MatProductSetFromOptions_SeqAIJ; 3166a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr); 3167a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr); 3168a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 3169a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 3170a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr); 3171fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr); 3172a587d139SMark } else { 3173a587d139SMark A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 3174a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 3175a587d139SMark A->ops->mult = MatMult_SeqAIJCUSPARSE; 3176a587d139SMark A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 3177a587d139SMark A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 3178a587d139SMark A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 3179a587d139SMark A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 3180a587d139SMark A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 3181fcdce8c4SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 3182a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3183a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3184a587d139SMark ierr = 
PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3185a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr); 3186a587d139SMark ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr); 3187fcdce8c4SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr); 3188a587d139SMark } 3189a587d139SMark A->boundtocpu = flg; 3190a587d139SMark a->inode.use = flg; 3191a587d139SMark PetscFunctionReturn(0); 3192a587d139SMark } 3193a587d139SMark 319449735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 31959ae82921SPaul Mullowney { 31969ae82921SPaul Mullowney PetscErrorCode ierr; 3197aa372e3fSPaul Mullowney cusparseStatus_t stat; 319849735bf3SStefano Zampini Mat B; 31999ae82921SPaul Mullowney 32009ae82921SPaul Mullowney PetscFunctionBegin; 3201832b2c02SStefano Zampini ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */ 320249735bf3SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 320349735bf3SStefano Zampini ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 320449735bf3SStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 320549735bf3SStefano Zampini ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 320649735bf3SStefano Zampini } 320749735bf3SStefano Zampini B = *newmat; 320849735bf3SStefano Zampini 320934136279SStefano Zampini ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr); 321034136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr); 321134136279SStefano Zampini 321249735bf3SStefano Zampini if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 32139ae82921SPaul Mullowney 
if (B->factortype == MAT_FACTOR_NONE) { 3214e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *spptr; 3215e6e9a74fSStefano Zampini 3216e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3217e6e9a74fSStefano Zampini spptr->format = MAT_CUSPARSE_CSR; 3218e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3219e6e9a74fSStefano Zampini B->spptr = spptr; 32203fa6b06aSMark Adams spptr->deviceMat = NULL; 32219ae82921SPaul Mullowney } else { 3222e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSETriFactors *spptr; 3223e6e9a74fSStefano Zampini 3224e6e9a74fSStefano Zampini ierr = PetscNew(&spptr);CHKERRQ(ierr); 3225e6e9a74fSStefano Zampini stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat); 3226e6e9a74fSStefano Zampini B->spptr = spptr; 32279ae82921SPaul Mullowney } 3228e6e9a74fSStefano Zampini B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 322949735bf3SStefano Zampini } 3230693b0035SStefano Zampini B->ops->assemblyend = MatAssemblyEnd_SeqAIJCUSPARSE; 32319ae82921SPaul Mullowney B->ops->destroy = MatDestroy_SeqAIJCUSPARSE; 32329ae82921SPaul Mullowney B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE; 323395639643SRichard Tran Mills B->ops->bindtocpu = MatBindToCPU_SeqAIJCUSPARSE; 3234693b0035SStefano Zampini B->ops->duplicate = MatDuplicate_SeqAIJCUSPARSE; 32352205254eSKarl Rupp 3236e6e9a74fSStefano Zampini ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr); 32379ae82921SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3238bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr); 32399ae82921SPaul Mullowney PetscFunctionReturn(0); 32409ae82921SPaul Mullowney } 32419ae82921SPaul Mullowney 324202fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B) 324302fe1965SBarry Smith { 324402fe1965SBarry Smith PetscErrorCode ierr; 324502fe1965SBarry Smith 
324602fe1965SBarry Smith PetscFunctionBegin; 324702fe1965SBarry Smith ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr); 32480ce8acdeSStefano Zampini ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr); 3249afb2bd1cSJunchao Zhang ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr); 3250afb2bd1cSJunchao Zhang ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr); 3251afb2bd1cSJunchao Zhang ierr = PetscOptionsEnd();CHKERRQ(ierr); 325202fe1965SBarry Smith PetscFunctionReturn(0); 325302fe1965SBarry Smith } 325402fe1965SBarry Smith 32553ca39a21SBarry Smith /*MC 3256e057df02SPaul Mullowney MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices. 3257e057df02SPaul Mullowney 3258e057df02SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 32592692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 32602692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3261e057df02SPaul Mullowney 3262e057df02SPaul Mullowney Options Database Keys: 3263e057df02SPaul Mullowney + -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions() 3264aa372e3fSPaul Mullowney . -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 3265a2b725a8SWilliam Gropp - -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
3266e057df02SPaul Mullowney 3267e057df02SPaul Mullowney Level: beginner 3268e057df02SPaul Mullowney 32698468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 3270e057df02SPaul Mullowney M*/ 32717f756511SDominic Meiser 327242c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*); 327342c9c57cSBarry Smith 32740f39cd5aSBarry Smith 32753ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void) 327642c9c57cSBarry Smith { 327742c9c57cSBarry Smith PetscErrorCode ierr; 327842c9c57cSBarry Smith 327942c9c57cSBarry Smith PetscFunctionBegin; 32803ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 32813ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 32823ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 32833ca39a21SBarry Smith ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr); 328442c9c57cSBarry Smith PetscFunctionReturn(0); 328542c9c57cSBarry Smith } 328629b38603SBarry Smith 3287470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct) 32887f756511SDominic Meiser { 3289e6e9a74fSStefano Zampini PetscErrorCode ierr; 32907f756511SDominic Meiser cusparseStatus_t stat; 32917f756511SDominic Meiser 32927f756511SDominic Meiser PetscFunctionBegin; 32937f756511SDominic Meiser if (*cusparsestruct) { 3294e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 
3295e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 32967f756511SDominic Meiser delete (*cusparsestruct)->workVector; 329781902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 32987e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm; 32997e8381f9SStefano Zampini delete (*cusparsestruct)->cooPerm_a; 33007e8381f9SStefano Zampini if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);} 3301afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3302afb2bd1cSJunchao Zhang cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 3303afb2bd1cSJunchao Zhang #endif 3304e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 33057f756511SDominic Meiser } 33067f756511SDominic Meiser PetscFunctionReturn(0); 33077f756511SDominic Meiser } 33087f756511SDominic Meiser 33097f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 33107f756511SDominic Meiser { 33117f756511SDominic Meiser PetscFunctionBegin; 33127f756511SDominic Meiser if (*mat) { 33137f756511SDominic Meiser delete (*mat)->values; 33147f756511SDominic Meiser delete (*mat)->column_indices; 33157f756511SDominic Meiser delete (*mat)->row_offsets; 33167f756511SDominic Meiser delete *mat; 33177f756511SDominic Meiser *mat = 0; 33187f756511SDominic Meiser } 33197f756511SDominic Meiser PetscFunctionReturn(0); 33207f756511SDominic Meiser } 33217f756511SDominic Meiser 3322470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 33237f756511SDominic Meiser { 33247f756511SDominic Meiser cusparseStatus_t stat; 33257f756511SDominic Meiser PetscErrorCode ierr; 33267f756511SDominic Meiser 33277f756511SDominic Meiser PetscFunctionBegin; 33287f756511SDominic Meiser if (*trifactor) { 332957d48284SJunchao Zhang if ((*trifactor)->descr) { 
stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 3330afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 33317f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 33321b0a6780SStefano Zampini if ((*trifactor)->solveBuffer) {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 33332cbc15d9SMark if ((*trifactor)->AA_h) {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);} 3334afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 33351b0a6780SStefano Zampini if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 3336afb2bd1cSJunchao Zhang #endif 3337da79fbbcSStefano Zampini ierr = PetscFree(*trifactor);CHKERRQ(ierr); 33387f756511SDominic Meiser } 33397f756511SDominic Meiser PetscFunctionReturn(0); 33407f756511SDominic Meiser } 33417f756511SDominic Meiser 3342470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 33437f756511SDominic Meiser { 33447f756511SDominic Meiser CsrMatrix *mat; 33457f756511SDominic Meiser cusparseStatus_t stat; 33467f756511SDominic Meiser cudaError_t err; 33477f756511SDominic Meiser 33487f756511SDominic Meiser PetscFunctionBegin; 33497f756511SDominic Meiser if (*matstruct) { 33507f756511SDominic Meiser if ((*matstruct)->mat) { 33517f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 3352afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3353afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3354afb2bd1cSJunchao Zhang #else 33557f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 335657d48284SJunchao Zhang stat = 
cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 3357afb2bd1cSJunchao Zhang #endif 33587f756511SDominic Meiser } else { 33597f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 33607f756511SDominic Meiser CsrMatrix_Destroy(&mat); 33617f756511SDominic Meiser } 33627f756511SDominic Meiser } 336357d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 33647f756511SDominic Meiser delete (*matstruct)->cprowIndices; 3365afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 33667656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 33677656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 3368afb2bd1cSJunchao Zhang 3369afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3370afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 3371afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 3372afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 3373afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 3374afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 3375afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 3376afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 3377afb2bd1cSJunchao Zhang } 3378afb2bd1cSJunchao Zhang } 3379afb2bd1cSJunchao Zhang #endif 33807f756511SDominic Meiser delete *matstruct; 33817e8381f9SStefano Zampini *matstruct = NULL; 33827f756511SDominic Meiser } 33837f756511SDominic Meiser PetscFunctionReturn(0); 33847f756511SDominic Meiser } 33857f756511SDominic Meiser 3386ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 
33877f756511SDominic Meiser { 3388e6e9a74fSStefano Zampini PetscErrorCode ierr; 3389e6e9a74fSStefano Zampini 33907f756511SDominic Meiser PetscFunctionBegin; 33917f756511SDominic Meiser if (*trifactors) { 3392e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 3393e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 3394e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 3395e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 33967f756511SDominic Meiser delete (*trifactors)->rpermIndices; 33977f756511SDominic Meiser delete (*trifactors)->cpermIndices; 33987f756511SDominic Meiser delete (*trifactors)->workVector; 33997e8381f9SStefano Zampini (*trifactors)->rpermIndices = NULL; 34007e8381f9SStefano Zampini (*trifactors)->cpermIndices = NULL; 34017e8381f9SStefano Zampini (*trifactors)->workVector = NULL; 3402ccdfe979SStefano Zampini } 3403ccdfe979SStefano Zampini PetscFunctionReturn(0); 3404ccdfe979SStefano Zampini } 3405ccdfe979SStefano Zampini 3406ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 3407ccdfe979SStefano Zampini { 3408e6e9a74fSStefano Zampini PetscErrorCode ierr; 3409ccdfe979SStefano Zampini cusparseHandle_t handle; 3410ccdfe979SStefano Zampini cusparseStatus_t stat; 3411ccdfe979SStefano Zampini 3412ccdfe979SStefano Zampini PetscFunctionBegin; 3413ccdfe979SStefano Zampini if (*trifactors) { 3414e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 34157f756511SDominic Meiser if (handle = (*trifactors)->handle) { 341657d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 34177f756511SDominic Meiser } 3418e6e9a74fSStefano Zampini ierr = 
PetscFree(*trifactors);CHKERRQ(ierr); 34197f756511SDominic Meiser } 34207f756511SDominic Meiser PetscFunctionReturn(0); 34217f756511SDominic Meiser } 34227e8381f9SStefano Zampini 34237e8381f9SStefano Zampini struct IJCompare 34247e8381f9SStefano Zampini { 34257e8381f9SStefano Zampini __host__ __device__ 34267e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 34277e8381f9SStefano Zampini { 34287e8381f9SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 34297e8381f9SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 34307e8381f9SStefano Zampini return false; 34317e8381f9SStefano Zampini } 34327e8381f9SStefano Zampini }; 34337e8381f9SStefano Zampini 34347e8381f9SStefano Zampini struct IJEqual 34357e8381f9SStefano Zampini { 34367e8381f9SStefano Zampini __host__ __device__ 34377e8381f9SStefano Zampini inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) 34387e8381f9SStefano Zampini { 34397e8381f9SStefano Zampini if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false; 34407e8381f9SStefano Zampini return true; 34417e8381f9SStefano Zampini } 34427e8381f9SStefano Zampini }; 34437e8381f9SStefano Zampini 34447e8381f9SStefano Zampini struct IJDiff 34457e8381f9SStefano Zampini { 34467e8381f9SStefano Zampini __host__ __device__ 34477e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 34487e8381f9SStefano Zampini { 34497e8381f9SStefano Zampini return t1 == t2 ? 
0 : 1; 34507e8381f9SStefano Zampini } 34517e8381f9SStefano Zampini }; 34527e8381f9SStefano Zampini 34537e8381f9SStefano Zampini struct IJSum 34547e8381f9SStefano Zampini { 34557e8381f9SStefano Zampini __host__ __device__ 34567e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2) 34577e8381f9SStefano Zampini { 34587e8381f9SStefano Zampini return t1||t2; 34597e8381f9SStefano Zampini } 34607e8381f9SStefano Zampini }; 34617e8381f9SStefano Zampini 34627e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h> 3463e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 34647e8381f9SStefano Zampini { 34657e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3466fcdce8c4SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3467e61fc153SStefano Zampini THRUSTARRAY *cooPerm_v = NULL,*cooPerm_w; 346808391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 34697e8381f9SStefano Zampini CsrMatrix *matrix; 34707e8381f9SStefano Zampini PetscErrorCode ierr; 34717e8381f9SStefano Zampini cudaError_t cerr; 34727e8381f9SStefano Zampini PetscInt n; 34737e8381f9SStefano Zampini 34747e8381f9SStefano Zampini PetscFunctionBegin; 34757e8381f9SStefano Zampini if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct"); 34767e8381f9SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix"); 34777e8381f9SStefano Zampini if (!cusp->cooPerm) { 34787e8381f9SStefano Zampini ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 34797e8381f9SStefano Zampini ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 34807e8381f9SStefano Zampini PetscFunctionReturn(0); 34817e8381f9SStefano Zampini } 34827e8381f9SStefano Zampini matrix = (CsrMatrix*)cusp->mat->mat; 34837e8381f9SStefano Zampini if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 
34847e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3485e61fc153SStefano Zampini if (!v) { 3486e61fc153SStefano Zampini if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.); 3487e61fc153SStefano Zampini goto finalize; 34887e8381f9SStefano Zampini } 3489e61fc153SStefano Zampini n = cusp->cooPerm->size(); 349008391a17SStefano Zampini if (isCudaMem(v)) { 349108391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 349208391a17SStefano Zampini } else { 3493e61fc153SStefano Zampini cooPerm_v = new THRUSTARRAY(n); 3494e61fc153SStefano Zampini cooPerm_v->assign(v,v+n); 349508391a17SStefano Zampini d_v = cooPerm_v->data(); 3496e61fc153SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 349708391a17SStefano Zampini } 3498e61fc153SStefano Zampini if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */ 34997e8381f9SStefano Zampini if (cusp->cooPerm_a) { 3500e61fc153SStefano Zampini cooPerm_w = new THRUSTARRAY(matrix->values->size()); 350108391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3502e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 3503e61fc153SStefano Zampini thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>()); 3504e61fc153SStefano Zampini delete cooPerm_w; 35057e8381f9SStefano Zampini } else { 350608391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 35077e8381f9SStefano Zampini matrix->values->begin())); 350808391a17SStefano Zampini auto zieit = 
thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 35097e8381f9SStefano Zampini matrix->values->end())); 35107e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAPlusEquals()); 35117e8381f9SStefano Zampini } 35127e8381f9SStefano Zampini } else { 3513e61fc153SStefano Zampini if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */ 351408391a17SStefano Zampini auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()); 3515e61fc153SStefano Zampini thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>()); 35167e8381f9SStefano Zampini } else { 351708391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()), 35187e8381f9SStefano Zampini matrix->values->begin())); 351908391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()), 35207e8381f9SStefano Zampini matrix->values->end())); 35217e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 35227e8381f9SStefano Zampini } 35237e8381f9SStefano Zampini } 35247e8381f9SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 35257e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 3526e61fc153SStefano Zampini finalize: 3527e61fc153SStefano Zampini delete cooPerm_v; 35287e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3529e61fc153SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3530fcdce8c4SStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 3531fcdce8c4SStefano Zampini ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr); 3532fcdce8c4SStefano Zampini ierr 
= PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 3533fcdce8c4SStefano Zampini ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr); 3534fcdce8c4SStefano Zampini a->reallocs = 0; 3535fcdce8c4SStefano Zampini A->info.mallocs += 0; 3536fcdce8c4SStefano Zampini A->info.nz_unneeded = 0; 3537fcdce8c4SStefano Zampini A->assembled = A->was_assembled = PETSC_TRUE; 3538fcdce8c4SStefano Zampini A->num_ass++; 35397e8381f9SStefano Zampini /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! */ 35407e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr); 35417e8381f9SStefano Zampini PetscFunctionReturn(0); 35427e8381f9SStefano Zampini } 35437e8381f9SStefano Zampini 35447e8381f9SStefano Zampini #include <thrust/binary_search.h> 3545e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 35467e8381f9SStefano Zampini { 35477e8381f9SStefano Zampini PetscErrorCode ierr; 35487e8381f9SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 35497e8381f9SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 35507e8381f9SStefano Zampini PetscInt cooPerm_n, nzr = 0; 35517e8381f9SStefano Zampini cudaError_t cerr; 35527e8381f9SStefano Zampini 35537e8381f9SStefano Zampini PetscFunctionBegin; 35547e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 35557e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr); 35567e8381f9SStefano Zampini ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr); 35577e8381f9SStefano Zampini cooPerm_n = cusp->cooPerm ? 
cusp->cooPerm->size() : 0; 35587e8381f9SStefano Zampini if (n != cooPerm_n) { 35597e8381f9SStefano Zampini delete cusp->cooPerm; 35607e8381f9SStefano Zampini delete cusp->cooPerm_a; 35617e8381f9SStefano Zampini cusp->cooPerm = NULL; 35627e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 35637e8381f9SStefano Zampini } 35647e8381f9SStefano Zampini if (n) { 35657e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 35667e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 35677e8381f9SStefano Zampini THRUSTINTARRAY ii(A->rmap->n); 35687e8381f9SStefano Zampini 35697e8381f9SStefano Zampini if (!cusp->cooPerm) { cusp->cooPerm = new THRUSTINTARRAY(n); } 35707e8381f9SStefano Zampini if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); } 35717e8381f9SStefano Zampini 35727e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 35737e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 35747e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 35757e8381f9SStefano Zampini auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin())); 35767e8381f9SStefano Zampini auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end())); 35777e8381f9SStefano Zampini 357808391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 35797e8381f9SStefano Zampini thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0); 35807e8381f9SStefano Zampini thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare()); 35817e8381f9SStefano Zampini *cusp->cooPerm_a = d_i; 35827e8381f9SStefano Zampini THRUSTINTARRAY w = d_j; 35837e8381f9SStefano Zampini 35847e8381f9SStefano Zampini auto nekey = thrust::unique(fkey, ekey, IJEqual()); 35857e8381f9SStefano Zampini if (nekey == ekey) { /* all entries are unique */ 35867e8381f9SStefano Zampini delete cusp->cooPerm_a; 35877e8381f9SStefano Zampini cusp->cooPerm_a = NULL; 35887e8381f9SStefano Zampini } else { /* I couldn't come up with a more elegant algorithm */ 
35897e8381f9SStefano Zampini adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); 35907e8381f9SStefano Zampini adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff()); 35917e8381f9SStefano Zampini (*cusp->cooPerm_a)[0] = 0; 35927e8381f9SStefano Zampini w[0] = 0; 35937e8381f9SStefano Zampini thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum()); 35947e8381f9SStefano Zampini thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); 35957e8381f9SStefano Zampini } 35967e8381f9SStefano Zampini thrust::counting_iterator<PetscInt> search_begin(0); 35977e8381f9SStefano Zampini thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(), 35987e8381f9SStefano Zampini search_begin, search_begin + A->rmap->n, 35997e8381f9SStefano Zampini ii.begin()); 360008391a17SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 360108391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 36027e8381f9SStefano Zampini 36037e8381f9SStefano Zampini ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr); 36047e8381f9SStefano Zampini a->singlemalloc = PETSC_FALSE; 36057e8381f9SStefano Zampini a->free_a = PETSC_TRUE; 36067e8381f9SStefano Zampini a->free_ij = PETSC_TRUE; 36077e8381f9SStefano Zampini ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr); 36087e8381f9SStefano Zampini a->i[0] = 0; 36097e8381f9SStefano Zampini cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 36107e8381f9SStefano Zampini a->nz = a->maxnz = a->i[A->rmap->n]; 3611fcdce8c4SStefano Zampini a->rmax = 0; 36127e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr); 36137e8381f9SStefano Zampini ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr); 36147e8381f9SStefano Zampini cerr = 
cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 36157e8381f9SStefano Zampini if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); } 36167e8381f9SStefano Zampini if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); } 36177e8381f9SStefano Zampini for (PetscInt i = 0; i < A->rmap->n; i++) { 36187e8381f9SStefano Zampini const PetscInt nnzr = a->i[i+1] - a->i[i]; 36197e8381f9SStefano Zampini nzr += (PetscInt)!!(nnzr); 36207e8381f9SStefano Zampini a->ilen[i] = a->imax[i] = nnzr; 3621fcdce8c4SStefano Zampini a->rmax = PetscMax(a->rmax,nnzr); 36227e8381f9SStefano Zampini } 3623fcdce8c4SStefano Zampini a->nonzerorowcnt = nzr; 36247e8381f9SStefano Zampini A->preallocated = PETSC_TRUE; 36257e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr); 3626fcdce8c4SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr); 36277e8381f9SStefano Zampini } else { 36287e8381f9SStefano Zampini ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr); 36297e8381f9SStefano Zampini } 36307e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,A,0,0,0);CHKERRQ(ierr); 3631e61fc153SStefano Zampini ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr); 36327e8381f9SStefano Zampini 36337e8381f9SStefano Zampini /* We want to allocate the CUSPARSE struct for matvec now. 
3634e61fc153SStefano Zampini The code is so convoluted now that I prefer to copy zeros */ 3635e61fc153SStefano Zampini ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr); 36367e8381f9SStefano Zampini ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr); 36377e8381f9SStefano Zampini A->offloadmask = PETSC_OFFLOAD_CPU; 36387e8381f9SStefano Zampini A->nonzerostate++; 36397e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 36407e8381f9SStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 36417e8381f9SStefano Zampini 36427e8381f9SStefano Zampini A->assembled = PETSC_FALSE; 36437e8381f9SStefano Zampini A->was_assembled = PETSC_FALSE; 36447e8381f9SStefano Zampini PetscFunctionReturn(0); 36457e8381f9SStefano Zampini } 3646ed502f03SStefano Zampini 3647ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a) 3648ed502f03SStefano Zampini { 3649ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3650ed502f03SStefano Zampini CsrMatrix *csr; 3651ed502f03SStefano Zampini PetscErrorCode ierr; 3652ed502f03SStefano Zampini 3653ed502f03SStefano Zampini PetscFunctionBegin; 3654ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3655ed502f03SStefano Zampini PetscValidPointer(a,2); 3656ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3657ed502f03SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3658ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3659ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3660ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3661ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 
3662ed502f03SStefano Zampini *a = csr->values->data().get(); 3663ed502f03SStefano Zampini PetscFunctionReturn(0); 3664ed502f03SStefano Zampini } 3665ed502f03SStefano Zampini 3666ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a) 3667ed502f03SStefano Zampini { 3668ed502f03SStefano Zampini PetscFunctionBegin; 3669ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3670ed502f03SStefano Zampini PetscValidPointer(a,2); 3671ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3672ed502f03SStefano Zampini *a = NULL; 3673ed502f03SStefano Zampini PetscFunctionReturn(0); 3674ed502f03SStefano Zampini } 3675ed502f03SStefano Zampini 3676ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a) 3677ed502f03SStefano Zampini { 3678ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 3679ed502f03SStefano Zampini CsrMatrix *csr; 3680ed502f03SStefano Zampini 3681ed502f03SStefano Zampini PetscFunctionBegin; 3682ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3683ed502f03SStefano Zampini PetscValidPointer(a,2); 3684ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3685ed502f03SStefano Zampini if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3686ed502f03SStefano Zampini if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3687ed502f03SStefano Zampini csr = (CsrMatrix*)cusp->mat->mat; 3688ed502f03SStefano Zampini if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory"); 3689ed502f03SStefano Zampini *a = csr->values->data().get(); 3690ed502f03SStefano Zampini PetscFunctionReturn(0); 3691ed502f03SStefano Zampini } 3692ed502f03SStefano Zampini 3693ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a) 3694ed502f03SStefano 
Zampini { 3695ed502f03SStefano Zampini PetscErrorCode ierr; 3696ed502f03SStefano Zampini 3697ed502f03SStefano Zampini PetscFunctionBegin; 3698ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3699ed502f03SStefano Zampini PetscValidPointer(a,2); 3700ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3701ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 3702ed502f03SStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 3703ed502f03SStefano Zampini *a = NULL; 3704ed502f03SStefano Zampini PetscFunctionReturn(0); 3705ed502f03SStefano Zampini } 3706ed502f03SStefano Zampini 3707ed502f03SStefano Zampini struct IJCompare4 3708ed502f03SStefano Zampini { 3709ed502f03SStefano Zampini __host__ __device__ 3710ed502f03SStefano Zampini inline bool operator() (const thrust::tuple<int, int, PetscScalar, bool> &t1, const thrust::tuple<int, int, PetscScalar, bool> &t2) 3711ed502f03SStefano Zampini { 3712ed502f03SStefano Zampini if (t1.get<0>() < t2.get<0>()) return true; 3713ed502f03SStefano Zampini if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>(); 3714ed502f03SStefano Zampini return false; 3715ed502f03SStefano Zampini } 3716ed502f03SStefano Zampini }; 3717ed502f03SStefano Zampini 3718ed502f03SStefano Zampini struct Shift 3719ed502f03SStefano Zampini { 3720ed502f03SStefano Zampini int _shift; 3721ed502f03SStefano Zampini 3722ed502f03SStefano Zampini Shift(int shift) : _shift(shift) {} 3723ed502f03SStefano Zampini __host__ __device__ 3724ed502f03SStefano Zampini inline int operator() (const int &c) 3725ed502f03SStefano Zampini { 3726ed502f03SStefano Zampini return c + _shift; 3727ed502f03SStefano Zampini } 3728ed502f03SStefano Zampini }; 3729ed502f03SStefano Zampini 3730ed502f03SStefano Zampini /* merges to SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */ 3731ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 3732ed502f03SStefano 
Zampini { 3733ed502f03SStefano Zampini PetscErrorCode ierr; 3734ed502f03SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c; 3735ed502f03SStefano Zampini Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp; 3736ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *Cmat; 3737ed502f03SStefano Zampini CsrMatrix *Acsr,*Bcsr,*Ccsr; 3738ed502f03SStefano Zampini PetscInt Annz,Bnnz; 3739ed502f03SStefano Zampini cusparseStatus_t stat; 3740ed502f03SStefano Zampini PetscInt i,m,n,zero = 0; 3741ed502f03SStefano Zampini cudaError_t cerr; 3742ed502f03SStefano Zampini 3743ed502f03SStefano Zampini PetscFunctionBegin; 3744ed502f03SStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 3745ed502f03SStefano Zampini PetscValidHeaderSpecific(B,MAT_CLASSID,2); 3746ed502f03SStefano Zampini PetscValidPointer(C,4); 3747ed502f03SStefano Zampini PetscCheckTypeName(A,MATSEQAIJCUSPARSE); 3748ed502f03SStefano Zampini PetscCheckTypeName(B,MATSEQAIJCUSPARSE); 3749ed502f03SStefano Zampini if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 3750ed502f03SStefano Zampini if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 3751ed502f03SStefano Zampini if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3752ed502f03SStefano Zampini if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3753ed502f03SStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 3754ed502f03SStefano Zampini m = A->rmap->n; 3755ed502f03SStefano Zampini n = A->cmap->n + B->cmap->n; 3756ed502f03SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr); 3757ed502f03SStefano Zampini ierr = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr); 3758ed502f03SStefano 
Zampini ierr = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3759ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 3760ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3761ed502f03SStefano Zampini Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3762ed502f03SStefano Zampini Ccsr = new CsrMatrix; 3763ed502f03SStefano Zampini Cmat->cprowIndices = NULL; 3764ed502f03SStefano Zampini c->compressedrow.use = PETSC_FALSE; 3765ed502f03SStefano Zampini c->compressedrow.nrows = 0; 3766ed502f03SStefano Zampini c->compressedrow.i = NULL; 3767ed502f03SStefano Zampini c->compressedrow.rindex = NULL; 3768ed502f03SStefano Zampini Ccusp->workVector = NULL; 3769ed502f03SStefano Zampini Ccusp->nrows = m; 3770ed502f03SStefano Zampini Ccusp->mat = Cmat; 3771ed502f03SStefano Zampini Ccusp->mat->mat = Ccsr; 3772ed502f03SStefano Zampini Ccsr->num_rows = m; 3773ed502f03SStefano Zampini Ccsr->num_cols = n; 3774ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat); 3775ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3776ed502f03SStefano Zampini stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3777ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3778ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3779ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3780ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3781ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3782ed502f03SStefano Zampini cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, 
sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3783ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3784ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3785ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr); 3786ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr); 3787ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3788ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3789ed502f03SStefano Zampini 3790ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 3791ed502f03SStefano Zampini Bcsr = (CsrMatrix*)Bcusp->mat->mat; 3792ed502f03SStefano Zampini Annz = (PetscInt)Acsr->column_indices->size(); 3793ed502f03SStefano Zampini Bnnz = (PetscInt)Bcsr->column_indices->size(); 3794ed502f03SStefano Zampini c->nz = Annz + Bnnz; 3795ed502f03SStefano Zampini Ccsr->row_offsets = new THRUSTINTARRAY32(m+1); 3796ed502f03SStefano Zampini Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3797ed502f03SStefano Zampini Ccsr->values = new THRUSTARRAY(c->nz); 3798ed502f03SStefano Zampini Ccsr->num_entries = c->nz; 3799ed502f03SStefano Zampini Ccusp->cooPerm = new THRUSTINTARRAY(c->nz); 3800ed502f03SStefano Zampini 3801ed502f03SStefano Zampini if (c->nz) { 3802ed502f03SStefano Zampini THRUSTINTARRAY32 Acoo(Annz); 3803ed502f03SStefano Zampini THRUSTINTARRAY32 Bcoo(Bnnz); 3804ed502f03SStefano Zampini THRUSTINTARRAY32 *roff; 3805ed502f03SStefano Zampini if (a->compressedrow.use) { /* need full row offset */ 3806ed502f03SStefano Zampini if (!Acusp->rowoffsets_gpu) { 3807ed502f03SStefano Zampini Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 3808ed502f03SStefano Zampini Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1); 3809ed502f03SStefano Zampini ierr = 
PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3810ed502f03SStefano Zampini } 3811ed502f03SStefano Zampini roff = Acusp->rowoffsets_gpu; 3812ed502f03SStefano Zampini } else roff = Acsr->row_offsets; 3813ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3814ed502f03SStefano Zampini stat = cusparseXcsr2coo(Acusp->handle, 3815ed502f03SStefano Zampini roff->data().get(), 3816ed502f03SStefano Zampini Annz, 3817ed502f03SStefano Zampini m, 3818ed502f03SStefano Zampini Acoo.data().get(), 3819ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3820ed502f03SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3821ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3822ed502f03SStefano Zampini if (b->compressedrow.use) { /* need full row offset */ 3823ed502f03SStefano Zampini if (!Bcusp->rowoffsets_gpu) { 3824ed502f03SStefano Zampini Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3825ed502f03SStefano Zampini Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1); 3826ed502f03SStefano Zampini ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr); 3827ed502f03SStefano Zampini } 3828ed502f03SStefano Zampini roff = Bcusp->rowoffsets_gpu; 3829ed502f03SStefano Zampini } else roff = Bcsr->row_offsets; 3830ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3831ed502f03SStefano Zampini stat = cusparseXcsr2coo(Bcusp->handle, 3832ed502f03SStefano Zampini roff->data().get(), 3833ed502f03SStefano Zampini Bnnz, 3834ed502f03SStefano Zampini m, 3835ed502f03SStefano Zampini Bcoo.data().get(), 3836ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3837ed502f03SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3838ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3839ed502f03SStefano Zampini THRUSTINTARRAY32 Ccoo(c->nz); 3840ed502f03SStefano Zampini auto Aperm = thrust::make_constant_iterator(true); 
3841ed502f03SStefano Zampini auto Bperm = thrust::make_constant_iterator(false); 3842ed502f03SStefano Zampini auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n)); 3843ed502f03SStefano Zampini auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n)); 3844ed502f03SStefano Zampini thrust::device_vector<bool> wPerm(Annz+Bnnz); 3845ed502f03SStefano Zampini auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo.begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm)); 3846ed502f03SStefano Zampini auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo.end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm)); 3847ed502f03SStefano Zampini auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.begin(),Bcib,Bcsr->values->begin(),Bperm)); 3848ed502f03SStefano Zampini auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo.end(),Bcie,Bcsr->values->end(),Bperm)); 3849ed502f03SStefano Zampini auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo.begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm.begin())); 3850ed502f03SStefano Zampini auto p1 = Ccusp->cooPerm->begin(); 3851ed502f03SStefano Zampini auto p2 = Ccusp->cooPerm->begin(); 3852ed502f03SStefano Zampini thrust::advance(p2,Annz); 3853ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3854ed502f03SStefano Zampini thrust::merge(Azb,Aze,Bzb,Bze,Czb,IJCompare4()); 3855ed502f03SStefano Zampini thrust::partition_copy(thrust::make_counting_iterator(zero),thrust::make_counting_iterator(c->nz),wPerm.begin(),p1,p2,thrust::identity<bool>()); 3856ed502f03SStefano Zampini stat = cusparseXcoo2csr(Ccusp->handle, 3857ed502f03SStefano Zampini Ccoo.data().get(), 3858ed502f03SStefano Zampini c->nz, 3859ed502f03SStefano Zampini m, 3860ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), 3861ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3862ed502f03SStefano 
Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3863ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3864ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3865ed502f03SStefano Zampini stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, 3866ed502f03SStefano Zampini Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), 3867ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3868ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 3869ed502f03SStefano Zampini #endif 3870ed502f03SStefano Zampini if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */ 3871ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 3872ed502f03SStefano Zampini Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 3873ed502f03SStefano Zampini CsrMatrix *CcsrT = new CsrMatrix; 3874ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 3875ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? 
(CsrMatrix*)Bcusp->matTranspose->mat : NULL; 3876ed502f03SStefano Zampini 3877ed502f03SStefano Zampini Ccusp->transgen = PETSC_TRUE; 3878ed502f03SStefano Zampini CmatT->cprowIndices = NULL; 3879ed502f03SStefano Zampini CmatT->mat = CcsrT; 3880ed502f03SStefano Zampini CcsrT->num_rows = n; 3881ed502f03SStefano Zampini CcsrT->num_cols = m; 3882ed502f03SStefano Zampini CcsrT->num_entries = c->nz; 3883ed502f03SStefano Zampini 3884ed502f03SStefano Zampini CcsrT->row_offsets = new THRUSTINTARRAY32(n+1); 3885ed502f03SStefano Zampini CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 3886ed502f03SStefano Zampini CcsrT->values = new THRUSTARRAY(c->nz); 3887ed502f03SStefano Zampini 3888ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3889ed502f03SStefano Zampini auto rT = CcsrT->row_offsets->begin(); 3890ed502f03SStefano Zampini if (AT) { 3891ed502f03SStefano Zampini rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT); 3892ed502f03SStefano Zampini thrust::advance(rT,-1); 3893ed502f03SStefano Zampini } 3894ed502f03SStefano Zampini if (BT) { 3895ed502f03SStefano Zampini auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz)); 3896ed502f03SStefano Zampini auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz)); 3897ed502f03SStefano Zampini thrust::copy(titb,tite,rT); 3898ed502f03SStefano Zampini } 3899ed502f03SStefano Zampini auto cT = CcsrT->column_indices->begin(); 3900ed502f03SStefano Zampini if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT); 3901ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT); 3902ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 3903ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 3904ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 
3905ed502f03SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 3906ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3907ed502f03SStefano Zampini 3908ed502f03SStefano Zampini stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat); 3909ed502f03SStefano Zampini stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 3910ed502f03SStefano Zampini stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 3911ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr); 3912ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr); 3913ed502f03SStefano Zampini cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr); 3914ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3915ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3916ed502f03SStefano Zampini cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 3917ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 3918ed502f03SStefano Zampini stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, 3919ed502f03SStefano Zampini CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), 3920ed502f03SStefano Zampini CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 3921ed502f03SStefano Zampini CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat); 3922ed502f03SStefano Zampini #endif 3923ed502f03SStefano Zampini Ccusp->matTranspose = CmatT; 3924ed502f03SStefano Zampini } 3925ed502f03SStefano Zampini } 3926ed502f03SStefano Zampini 3927ed502f03SStefano Zampini 
c->singlemalloc = PETSC_FALSE; 3928ed502f03SStefano Zampini c->free_a = PETSC_TRUE; 3929ed502f03SStefano Zampini c->free_ij = PETSC_TRUE; 3930ed502f03SStefano Zampini ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr); 3931ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr); 3932ed502f03SStefano Zampini if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */ 3933ed502f03SStefano Zampini THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3934ed502f03SStefano Zampini THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3935ed502f03SStefano Zampini ii = *Ccsr->row_offsets; 3936ed502f03SStefano Zampini jj = *Ccsr->column_indices; 3937ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3938ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3939ed502f03SStefano Zampini } else { 3940ed502f03SStefano Zampini cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3941ed502f03SStefano Zampini cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 3942ed502f03SStefano Zampini } 3943ed502f03SStefano Zampini ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr); 3944ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr); 3945ed502f03SStefano Zampini ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr); 3946ed502f03SStefano Zampini c->maxnz = c->nz; 3947ed502f03SStefano Zampini c->nonzerorowcnt = 0; 3948ed502f03SStefano Zampini c->rmax = 0; 3949ed502f03SStefano Zampini for (i = 0; i < m; i++) { 3950ed502f03SStefano Zampini const PetscInt nn = c->i[i+1] - c->i[i]; 3951ed502f03SStefano 
Zampini c->ilen[i] = c->imax[i] = nn; 3952ed502f03SStefano Zampini c->nonzerorowcnt += (PetscInt)!!nn; 3953ed502f03SStefano Zampini c->rmax = PetscMax(c->rmax,nn); 3954ed502f03SStefano Zampini } 3955ed502f03SStefano Zampini ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr); 3956ed502f03SStefano Zampini ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr); 3957ed502f03SStefano Zampini (*C)->nonzerostate++; 3958ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr); 3959ed502f03SStefano Zampini ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr); 3960ed502f03SStefano Zampini Ccusp->nonzerostate = (*C)->nonzerostate; 3961ed502f03SStefano Zampini (*C)->preallocated = PETSC_TRUE; 3962ed502f03SStefano Zampini } else { 3963ed502f03SStefano Zampini if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n); 3964ed502f03SStefano Zampini c = (Mat_SeqAIJ*)(*C)->data; 3965ed502f03SStefano Zampini if (c->nz) { 3966ed502f03SStefano Zampini Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr; 3967ed502f03SStefano Zampini if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm"); 3968ed502f03SStefano Zampini if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented"); 3969ed502f03SStefano Zampini if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate"); 3970ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 3971ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr); 3972ed502f03SStefano Zampini if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3973ed502f03SStefano Zampini if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct"); 3974ed502f03SStefano Zampini Acsr = (CsrMatrix*)Acusp->mat->mat; 3975ed502f03SStefano Zampini Bcsr = 
(CsrMatrix*)Bcusp->mat->mat; 3976ed502f03SStefano Zampini Ccsr = (CsrMatrix*)Ccusp->mat->mat; 3977ed502f03SStefano Zampini if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size()); 3978ed502f03SStefano Zampini if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size()); 3979ed502f03SStefano Zampini if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size()); 3980ed502f03SStefano Zampini if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries); 3981ed502f03SStefano Zampini if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size()); 3982ed502f03SStefano Zampini auto pmid = Ccusp->cooPerm->begin(); 3983ed502f03SStefano Zampini thrust::advance(pmid,Acsr->num_entries); 3984ed502f03SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3985ed502f03SStefano Zampini auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), 3986ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin()))); 3987ed502f03SStefano Zampini auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), 3988ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 3989ed502f03SStefano Zampini thrust::for_each(zibait,zieait,VecCUDAEquals()); 3990ed502f03SStefano Zampini auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), 3991ed502f03SStefano Zampini 
thrust::make_permutation_iterator(Ccsr->values->begin(),pmid))); 3992ed502f03SStefano Zampini auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 3993ed502f03SStefano Zampini thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end()))); 3994ed502f03SStefano Zampini thrust::for_each(zibbit,ziebit,VecCUDAEquals()); 3995ed502f03SStefano Zampini if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) { 3996ed502f03SStefano Zampini if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 3997ed502f03SStefano Zampini PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 3998ed502f03SStefano Zampini CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL; 3999ed502f03SStefano Zampini CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL; 4000ed502f03SStefano Zampini CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat; 4001ed502f03SStefano Zampini auto vT = CcsrT->values->begin(); 4002ed502f03SStefano Zampini if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT); 4003ed502f03SStefano Zampini if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT); 4004ed502f03SStefano Zampini } 4005ed502f03SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 4006ed502f03SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 4007ed502f03SStefano Zampini } 4008ed502f03SStefano Zampini } 4009ed502f03SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr); 4010ed502f03SStefano Zampini (*C)->assembled = PETSC_TRUE; 4011ed502f03SStefano Zampini (*C)->was_assembled = PETSC_FALSE; 4012ed502f03SStefano Zampini (*C)->offloadmask = PETSC_OFFLOAD_GPU; 4013ed502f03SStefano Zampini /* we can remove this call when MatSeqAIJGetArray operations are used everywhere! 
*/ 4014ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSECopyFromGPU(*C);CHKERRQ(ierr); 4015ed502f03SStefano Zampini PetscFunctionReturn(0); 4016ed502f03SStefano Zampini } 4017