19ae82921SPaul Mullowney /* 29ae82921SPaul Mullowney Defines the basic matrix operations for the AIJ (compressed row) 3fd7c363cSSatish Balay matrix storage format using the CUSPARSE library, 49ae82921SPaul Mullowney */ 5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 89ae82921SPaul Mullowney 93d13b8fdSMatthew G. Knepley #include <petscconf.h> 103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h> 123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h> 13af0996ceSBarry Smith #include <petsc/private/vecimpl.h> 149ae82921SPaul Mullowney #undef VecType 153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 16bc3f50f2SPaul Mullowney 17e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[] = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0}; 18afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 19afb2bd1cSJunchao Zhang /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 20afb2bd1cSJunchao Zhang 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 
21afb2bd1cSJunchao Zhang 22afb2bd1cSJunchao Zhang typedef enum { 23afb2bd1cSJunchao Zhang CUSPARSE_MV_ALG_DEFAULT = 0, 24afb2bd1cSJunchao Zhang CUSPARSE_COOMV_ALG = 1, 25afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG1 = 2, 26afb2bd1cSJunchao Zhang CUSPARSE_CSRMV_ALG2 = 3 27afb2bd1cSJunchao Zhang } cusparseSpMVAlg_t; 28afb2bd1cSJunchao Zhang 29afb2bd1cSJunchao Zhang typedef enum { 30afb2bd1cSJunchao Zhang CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 31afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 32afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 33afb2bd1cSJunchao Zhang CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 34afb2bd1cSJunchao Zhang CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 35afb2bd1cSJunchao Zhang CUSPARSE_SPMM_ALG_DEFAULT = 0, 36afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG1 = 1, 37afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG2 = 2, 38afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG3 = 3, 39afb2bd1cSJunchao Zhang CUSPARSE_SPMM_COO_ALG4 = 5, 40afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG1 = 4, 41afb2bd1cSJunchao Zhang CUSPARSE_SPMM_CSR_ALG2 = 6, 42afb2bd1cSJunchao Zhang } cusparseSpMMAlg_t; 43afb2bd1cSJunchao Zhang 44afb2bd1cSJunchao Zhang typedef enum { 45afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 46afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 47afb2bd1cSJunchao Zhang } cusparseCsr2CscAlg_t; 48afb2bd1cSJunchao Zhang */ 49afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0}; 50afb2bd1cSJunchao Zhang const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0}; 
51afb2bd1cSJunchao Zhang const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0}; 52afb2bd1cSJunchao Zhang #endif 539ae82921SPaul Mullowney 54087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 55087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*); 56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 57087f3262SPaul Mullowney 586fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 596fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*); 606fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*); 61087f3262SPaul Mullowney 626fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec); 636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 646fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec); 664416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat); 676fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec); 686fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 696fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 706fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 71e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec); 72e6e9a74fSStefano Zampini static 
PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec); 73e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool); 749ae82921SPaul Mullowney 757f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**); 76470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**); 77470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat); 78ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**); 79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**); 80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**); 817f756511SDominic Meiser 82b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream) 83b06137fdSPaul Mullowney { 84b06137fdSPaul Mullowney cusparseStatus_t stat; 85b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 86b06137fdSPaul Mullowney 87b06137fdSPaul Mullowney PetscFunctionBegin; 88b06137fdSPaul Mullowney cusparsestruct->stream = stream; 8957d48284SJunchao Zhang stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat); 90b06137fdSPaul Mullowney PetscFunctionReturn(0); 91b06137fdSPaul Mullowney } 92b06137fdSPaul Mullowney 93b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle) 94b06137fdSPaul Mullowney { 95b06137fdSPaul Mullowney cusparseStatus_t stat; 96b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 97b06137fdSPaul Mullowney 98b06137fdSPaul Mullowney PetscFunctionBegin; 996b1cf21dSAlejandro Lamas Daviña if (cusparsestruct->handle != handle) { 10016a2e217SAlejandro Lamas Daviña if 
(cusparsestruct->handle) { 10157d48284SJunchao Zhang stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat); 10216a2e217SAlejandro Lamas Daviña } 103b06137fdSPaul Mullowney cusparsestruct->handle = handle; 1046b1cf21dSAlejandro Lamas Daviña } 10557d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 106b06137fdSPaul Mullowney PetscFunctionReturn(0); 107b06137fdSPaul Mullowney } 108b06137fdSPaul Mullowney 109b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A) 110b06137fdSPaul Mullowney { 111b06137fdSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 112ccdfe979SStefano Zampini 113b06137fdSPaul Mullowney PetscFunctionBegin; 114ccdfe979SStefano Zampini if (cusparsestruct->handle) cusparsestruct->handle = 0; 115b06137fdSPaul Mullowney PetscFunctionReturn(0); 116b06137fdSPaul Mullowney } 117b06137fdSPaul Mullowney 118ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type) 1199ae82921SPaul Mullowney { 1209ae82921SPaul Mullowney PetscFunctionBegin; 1219ae82921SPaul Mullowney *type = MATSOLVERCUSPARSE; 1229ae82921SPaul Mullowney PetscFunctionReturn(0); 1239ae82921SPaul Mullowney } 1249ae82921SPaul Mullowney 125c708e6cdSJed Brown /*MC 126087f3262SPaul Mullowney MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices 127087f3262SPaul Mullowney on a single GPU of type, seqaijcusparse, aijcusparse, or seqaijcusp, aijcusp. Currently supported 128087f3262SPaul Mullowney algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) results in poorer 129087f3262SPaul Mullowney performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the 130087f3262SPaul Mullowney CUSPARSE triangular solve algorithm. 
However, the performance can be quite poor and thus these 131087f3262SPaul Mullowney algorithms are not recommended. This class does NOT support direct solver operations. 132c708e6cdSJed Brown 1339ae82921SPaul Mullowney Level: beginner 134c708e6cdSJed Brown 1353ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 136c708e6cdSJed Brown M*/ 1379ae82921SPaul Mullowney 13842c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B) 1399ae82921SPaul Mullowney { 1409ae82921SPaul Mullowney PetscErrorCode ierr; 141bc3f50f2SPaul Mullowney PetscInt n = A->rmap->n; 1429ae82921SPaul Mullowney 1439ae82921SPaul Mullowney PetscFunctionBegin; 144bc3f50f2SPaul Mullowney ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 145bc3f50f2SPaul Mullowney ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1462c7c0729SBarry Smith (*B)->factortype = ftype; 1472c7c0729SBarry Smith (*B)->useordering = PETSC_TRUE; 1489ae82921SPaul Mullowney ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1492205254eSKarl Rupp 150087f3262SPaul Mullowney if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 15133d57670SJed Brown ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 1529ae82921SPaul Mullowney (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE; 1539ae82921SPaul Mullowney (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJCUSPARSE; 154087f3262SPaul Mullowney } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 155087f3262SPaul Mullowney (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJCUSPARSE; 156087f3262SPaul Mullowney (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE; 1579ae82921SPaul Mullowney } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not 
supported for CUSPARSE Matrix Types"); 158bc3f50f2SPaul Mullowney 159fa03d054SJed Brown ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1603ca39a21SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr); 1619ae82921SPaul Mullowney PetscFunctionReturn(0); 1629ae82921SPaul Mullowney } 1639ae82921SPaul Mullowney 164bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 165ca45077fSPaul Mullowney { 166aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1676e111a19SKarl Rupp 168ca45077fSPaul Mullowney PetscFunctionBegin; 169ca45077fSPaul Mullowney switch (op) { 170e057df02SPaul Mullowney case MAT_CUSPARSE_MULT: 171aa372e3fSPaul Mullowney cusparsestruct->format = format; 172ca45077fSPaul Mullowney break; 173e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 174aa372e3fSPaul Mullowney cusparsestruct->format = format; 175ca45077fSPaul Mullowney break; 176ca45077fSPaul Mullowney default: 17736d62e41SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op); 178ca45077fSPaul Mullowney } 179ca45077fSPaul Mullowney PetscFunctionReturn(0); 180ca45077fSPaul Mullowney } 1819ae82921SPaul Mullowney 182e057df02SPaul Mullowney /*@ 183e057df02SPaul Mullowney MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular 184e057df02SPaul Mullowney operation. Only the MatMult operation can use different GPU storage formats 185aa372e3fSPaul Mullowney for MPIAIJCUSPARSE matrices. 186e057df02SPaul Mullowney Not Collective 187e057df02SPaul Mullowney 188e057df02SPaul Mullowney Input Parameters: 1898468deeeSKarl Rupp + A - Matrix of type SEQAIJCUSPARSE 19036d62e41SPaul Mullowney . 
op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL. 1912692e278SPaul Mullowney - format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2) 192e057df02SPaul Mullowney 193e057df02SPaul Mullowney Output Parameter: 194e057df02SPaul Mullowney 195e057df02SPaul Mullowney Level: intermediate 196e057df02SPaul Mullowney 1978468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 198e057df02SPaul Mullowney @*/ 199e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 200e057df02SPaul Mullowney { 201e057df02SPaul Mullowney PetscErrorCode ierr; 2026e111a19SKarl Rupp 203e057df02SPaul Mullowney PetscFunctionBegin; 204e057df02SPaul Mullowney PetscValidHeaderSpecific(A, MAT_CLASSID,1); 205e057df02SPaul Mullowney ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr); 206e057df02SPaul Mullowney PetscFunctionReturn(0); 207e057df02SPaul Mullowney } 208e057df02SPaul Mullowney 209e6e9a74fSStefano Zampini /*@ 210e6e9a74fSStefano Zampini MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the tranpose matrix before calling MatMultTranspose 211e6e9a74fSStefano Zampini 212e6e9a74fSStefano Zampini Collective on mat 213e6e9a74fSStefano Zampini 214e6e9a74fSStefano Zampini Input Parameters: 215e6e9a74fSStefano Zampini + A - Matrix of type SEQAIJCUSPARSE 216e6e9a74fSStefano Zampini - transgen - the boolean flag 217e6e9a74fSStefano Zampini 218e6e9a74fSStefano Zampini Level: intermediate 219e6e9a74fSStefano Zampini 220e6e9a74fSStefano Zampini .seealso: MATSEQAIJCUSPARSE 221e6e9a74fSStefano Zampini @*/ 222e6e9a74fSStefano Zampini PetscErrorCode 
MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen) 223e6e9a74fSStefano Zampini { 224e6e9a74fSStefano Zampini PetscErrorCode ierr; 225e6e9a74fSStefano Zampini PetscBool flg; 226e6e9a74fSStefano Zampini 227e6e9a74fSStefano Zampini PetscFunctionBegin; 228e6e9a74fSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 229e6e9a74fSStefano Zampini ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr); 230e6e9a74fSStefano Zampini if (flg) { 231e6e9a74fSStefano Zampini Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr; 23254da937aSStefano Zampini 233e6e9a74fSStefano Zampini if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix"); 234e6e9a74fSStefano Zampini cusp->transgen = transgen; 23554da937aSStefano Zampini if (!transgen) { /* need to destroy the transpose matrix if present to prevent from logic errors if transgen is set to true later */ 23654da937aSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr); 23754da937aSStefano Zampini } 238e6e9a74fSStefano Zampini } 239e6e9a74fSStefano Zampini PetscFunctionReturn(0); 240e6e9a74fSStefano Zampini } 241e6e9a74fSStefano Zampini 2424416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 2439ae82921SPaul Mullowney { 2449ae82921SPaul Mullowney PetscErrorCode ierr; 245e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 2469ae82921SPaul Mullowney PetscBool flg; 247a183c035SDominic Meiser Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 2486e111a19SKarl Rupp 2499ae82921SPaul Mullowney PetscFunctionBegin; 250e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr); 2519ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 25254da937aSStefano Zampini PetscBool transgen = cusparsestruct->transgen; 25354da937aSStefano Zampini 
25454da937aSStefano Zampini ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",transgen,&transgen,&flg);CHKERRQ(ierr); 255afb2bd1cSJunchao Zhang if (flg) {ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,transgen);CHKERRQ(ierr);} 256afb2bd1cSJunchao Zhang 257e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV", 258a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 259afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);} 260afb2bd1cSJunchao Zhang 2614c87dfd4SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", 262a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 263afb2bd1cSJunchao Zhang if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);} 264afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 265afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 266afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", 267afb2bd1cSJunchao Zhang "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr); 268afb2bd1cSJunchao Zhang /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 269afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSRMV_ALG1 != 2) 
SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 270afb2bd1cSJunchao Zhang 271afb2bd1cSJunchao Zhang cusparsestruct->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 272afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", 273afb2bd1cSJunchao Zhang "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr); 274afb2bd1cSJunchao Zhang if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 275afb2bd1cSJunchao Zhang 276afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 277afb2bd1cSJunchao Zhang ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", 278afb2bd1cSJunchao Zhang "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr); 279afb2bd1cSJunchao Zhang if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 280afb2bd1cSJunchao Zhang #endif 2814c87dfd4SPaul Mullowney } 2820af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 2839ae82921SPaul Mullowney PetscFunctionReturn(0); 2849ae82921SPaul Mullowney } 2859ae82921SPaul Mullowney 2866fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2879ae82921SPaul Mullowney { 2889ae82921SPaul Mullowney PetscErrorCode ierr; 2899ae82921SPaul Mullowney 2909ae82921SPaul Mullowney PetscFunctionBegin; 2919ae82921SPaul Mullowney 
ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 2929ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 2939ae82921SPaul Mullowney PetscFunctionReturn(0); 2949ae82921SPaul Mullowney } 2959ae82921SPaul Mullowney 2966fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 2979ae82921SPaul Mullowney { 2989ae82921SPaul Mullowney PetscErrorCode ierr; 2999ae82921SPaul Mullowney 3009ae82921SPaul Mullowney PetscFunctionBegin; 3019ae82921SPaul Mullowney ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 3029ae82921SPaul Mullowney B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE; 3039ae82921SPaul Mullowney PetscFunctionReturn(0); 3049ae82921SPaul Mullowney } 3059ae82921SPaul Mullowney 306087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 307087f3262SPaul Mullowney { 308087f3262SPaul Mullowney PetscErrorCode ierr; 309087f3262SPaul Mullowney 310087f3262SPaul Mullowney PetscFunctionBegin; 311087f3262SPaul Mullowney ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 312087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 313087f3262SPaul Mullowney PetscFunctionReturn(0); 314087f3262SPaul Mullowney } 315087f3262SPaul Mullowney 316087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info) 317087f3262SPaul Mullowney { 318087f3262SPaul Mullowney PetscErrorCode ierr; 319087f3262SPaul Mullowney 320087f3262SPaul Mullowney PetscFunctionBegin; 321087f3262SPaul Mullowney ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr); 322087f3262SPaul Mullowney B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE; 323087f3262SPaul Mullowney PetscFunctionReturn(0); 324087f3262SPaul Mullowney } 
325087f3262SPaul Mullowney 326087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 3279ae82921SPaul Mullowney { 3289ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3299ae82921SPaul Mullowney PetscInt n = A->rmap->n; 3309ae82921SPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 331aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 3329ae82921SPaul Mullowney cusparseStatus_t stat; 3339ae82921SPaul Mullowney const PetscInt *ai = a->i,*aj = a->j,*vi; 3349ae82921SPaul Mullowney const MatScalar *aa = a->a,*v; 3359ae82921SPaul Mullowney PetscInt *AiLo, *AjLo; 3369ae82921SPaul Mullowney PetscScalar *AALo; 3379ae82921SPaul Mullowney PetscInt i,nz, nzLower, offset, rowOffset; 338b175d8bbSPaul Mullowney PetscErrorCode ierr; 33957d48284SJunchao Zhang cudaError_t cerr; 3409ae82921SPaul Mullowney 3419ae82921SPaul Mullowney PetscFunctionBegin; 342cf00fe3bSKarl Rupp if (!n) PetscFunctionReturn(0); 343c70f7ee4SJunchao Zhang if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 3449ae82921SPaul Mullowney try { 3459ae82921SPaul Mullowney /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. 
*/ 3469ae82921SPaul Mullowney nzLower=n+ai[n]-ai[1]; 3479ae82921SPaul Mullowney 3489ae82921SPaul Mullowney /* Allocate Space for the lower triangular matrix */ 34957d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr); 35057d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr); 35157d48284SJunchao Zhang cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr); 3529ae82921SPaul Mullowney 3539ae82921SPaul Mullowney /* Fill the lower triangular matrix */ 3549ae82921SPaul Mullowney AiLo[0] = (PetscInt) 0; 3559ae82921SPaul Mullowney AiLo[n] = nzLower; 3569ae82921SPaul Mullowney AjLo[0] = (PetscInt) 0; 3579ae82921SPaul Mullowney AALo[0] = (MatScalar) 1.0; 3589ae82921SPaul Mullowney v = aa; 3599ae82921SPaul Mullowney vi = aj; 3609ae82921SPaul Mullowney offset = 1; 3619ae82921SPaul Mullowney rowOffset= 1; 3629ae82921SPaul Mullowney for (i=1; i<n; i++) { 3639ae82921SPaul Mullowney nz = ai[i+1] - ai[i]; 364e057df02SPaul Mullowney /* additional 1 for the term on the diagonal */ 3659ae82921SPaul Mullowney AiLo[i] = rowOffset; 3669ae82921SPaul Mullowney rowOffset += nz+1; 3679ae82921SPaul Mullowney 368580bdb30SBarry Smith ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr); 369580bdb30SBarry Smith ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr); 3709ae82921SPaul Mullowney 3719ae82921SPaul Mullowney offset += nz; 3729ae82921SPaul Mullowney AjLo[offset] = (PetscInt) i; 3739ae82921SPaul Mullowney AALo[offset] = (MatScalar) 1.0; 3749ae82921SPaul Mullowney offset += 1; 3759ae82921SPaul Mullowney 3769ae82921SPaul Mullowney v += nz; 3779ae82921SPaul Mullowney vi += nz; 3789ae82921SPaul Mullowney } 3792205254eSKarl Rupp 380aa372e3fSPaul Mullowney /* allocate space for the triangular factor information */ 381aa372e3fSPaul Mullowney loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct; 3822205254eSKarl Rupp 383aa372e3fSPaul Mullowney /* Create the 
matrix description */ 38457d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat); 38557d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat); 386afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 387afb2bd1cSJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 388afb2bd1cSJunchao Zhang #else 38957d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat); 390afb2bd1cSJunchao Zhang #endif 39157d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat); 39257d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat); 393aa372e3fSPaul Mullowney 394aa372e3fSPaul Mullowney /* set the operation */ 395aa372e3fSPaul Mullowney loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 396aa372e3fSPaul Mullowney 397aa372e3fSPaul Mullowney /* set the matrix */ 398aa372e3fSPaul Mullowney loTriFactor->csrMat = new CsrMatrix; 399aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows = n; 400aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols = n; 401aa372e3fSPaul Mullowney loTriFactor->csrMat->num_entries = nzLower; 402aa372e3fSPaul Mullowney 403aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1); 404aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1); 405aa372e3fSPaul Mullowney 406aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 407aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower); 408aa372e3fSPaul Mullowney 409aa372e3fSPaul Mullowney loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 410aa372e3fSPaul Mullowney loTriFactor->csrMat->values->assign(AALo, AALo+nzLower); 411aa372e3fSPaul 
Mullowney 412afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 413afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat); 414afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 415afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp, 416afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 417afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 418afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, 419afb2bd1cSJunchao Zhang &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat); 420afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr); 421afb2bd1cSJunchao Zhang #endif 422afb2bd1cSJunchao Zhang 423aa372e3fSPaul Mullowney /* perform the solve analysis */ 424aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, 425aa372e3fSPaul Mullowney loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, 426aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 427afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo 428afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 429afb2bd1cSJunchao Zhang ,loTriFactor->solvePolicy, loTriFactor->solveBuffer 430afb2bd1cSJunchao Zhang #endif 431afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 432aa372e3fSPaul Mullowney 433aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? 
 */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      /* host staging buffers are no longer needed once thrust owns device copies */
      cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Builds the upper-triangular factor U of an ILU factorization on the GPU:
   extracts U row-by-row (in reverse) from the SeqAIJ factor storage into
   pinned host buffers, copies it into a cusparse CSR triangular-factor
   structure, and runs the cusparse triangular-solve analysis on it.
   Only acts when the data is not already on the GPU (offloadmask check).
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  PetscInt                          n                   = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix.
         adiag[] decreases with the row index here, so this spans all of U */
      nzUpper = adiag[0]-adiag[n];

      /* Allocate Space for the upper triangular matrix; pinned memory for fast H2D copies */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix, walking rows from last to first
         and filling the CSR arrays back-to-front */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = nzUpper;
      for (i=n-1; i>=0; i--) {
        v  = aa + adiag[i+1] + 1;
        vi = aj + adiag[i+1] + 1;

        /* number of elements NOT on the diagonal */
        nz = adiag[i] - adiag[i+1]-1;

        /* decrement the offset */
        offset -= (nz+1);

        /* first, set the diagonal elements; the factor stores the pivot
           at v[nz], and the CSR copy holds its reciprocal */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1./v[nz];
        AiUp[i]      = AiUp[i+1] - (nz+1);

        /* then the off-diagonal entries of row i */
        ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
        ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 requires the GENERAL matrix type; fill mode conveys triangularity */
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* set the matrix: device-side CSR copy of U */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = n;
      upTriFactor->csrMat->num_cols = n;
      upTriFactor->csrMat->num_entries = nzUpper;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

      upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 needs an explicit, user-allocated work buffer */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary?
 */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* release the pinned host staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Copies both ILU triangular factors of A to the GPU and caches the
   row/column permutations (when non-identity) as device arrays so that
   MatSolve can permute on the device. Resets any previously cached
   factors first.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  const PetscInt               *r,*c;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* discard any stale device-side factors before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* lower triangular indices: cache the row permutation on the device only when needed */
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity) {
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  /* upper triangular indices: likewise for the column permutation */
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity) {
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
  }

  /* log only the permutation arrays actually transferred above */
  if (!row_identity && !col_identity) {
    ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!row_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  } else if(!col_identity) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Builds both triangular factors of an ICC (incomplete Cholesky)
   factorization on the GPU from the SeqSBAIJ factor storage: the upper
   factor U is copied directly, and the "lower" factor reuses the same
   upper-triangular sparsity (scaled values) solved with the transpose
   operation. See the body below for details.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
{
  Mat_SeqAIJ                        *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor =
(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  PetscInt                          *AiUp, *AjUp;
  PetscScalar                       *AAUp;
  PetscScalar                       *AALo;
  PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
  Mat_SeqSBAIJ                      *b  = (Mat_SeqSBAIJ*)A->data;  /* ICC factor data is stored in SBAIJ layout */
  const PetscInt                    *ai = b->i,*aj = b->j,*vj;
  const MatScalar                   *aa = b->a,*v;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* Allocate Space for the upper triangular matrix (pinned host buffers);
         AAUp/AALo share the same sparsity pattern but carry different values */
      cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* Fill the upper triangular matrix */
      AiUp[0]=(PetscInt) 0;
      AiUp[n]=nzUpper;
      offset = 0;
      for (i=0; i<n; i++) {
        /* set the pointers */
        v  = aa + ai[i];
        vj = aj + ai[i];
        nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */

        /* first, set the diagonal elements; v[nz] is the pivot of row i
           (NOTE(review): presumably stored as the diagonal of the SBAIJ factor — last entry of the row) */
        AjUp[offset] = (PetscInt) i;
        AAUp[offset] = (MatScalar)1.0/v[nz];
        AiUp[i]      = offset;
        AALo[offset] = (MatScalar)1.0/v[nz];

        offset+=1;
        if (nz>0) {
          ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
          /* upper factor takes the negated values; lower additionally divides by the pivot */
          for (j=offset; j<offset+nz; j++) {
            AAUp[j] = -AAUp[j];
            AALo[j] = AAUp[j]/v[nz];
          }
          offset+=nz;
        }
      }

      /* allocate space for the triangular factor information */
      upTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 requires the GENERAL matrix type */
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      /* unit diagonal here: the reciprocal pivots are folded into the stored values */
      stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* set the matrix */
      upTriFactor->csrMat = new CsrMatrix;
      upTriFactor->csrMat->num_rows = A->rmap->n;
      upTriFactor->csrMat->num_cols = A->cmap->n;
      upTriFactor->csrMat->num_entries = a->nz;

      upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);

      /* set the operation */
      upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 needs an explicit, user-allocated work buffer */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                     upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                     upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                     upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                     &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                               upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                               upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                               upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary?
 */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;

      /* allocate space for the triangular factor information */
      loTriFactor = new Mat_SeqAIJCUSPARSETriFactorStruct;

      /* Create the matrix description */
      stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      /* UPPER fill is intentional: the lower factor is stored with the upper-triangular
         sparsity (AALo values) and solved with the TRANSPOSE operation below */
      stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* set the operation */
      loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;

      /* set the matrix */
      loTriFactor->csrMat = new CsrMatrix;
      loTriFactor->csrMat->num_rows = A->rmap->n;
      loTriFactor->csrMat->num_cols = A->cmap->n;
      loTriFactor->csrMat->num_entries = a->nz;

      /* row offsets and column indices are shared with the upper factor's pattern */
      loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);

      loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
      loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);

      loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
      loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
      /* log both factors' transfers (hence the factor of 2) */
      ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);

      /* Create the solve analysis information */
      stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* CUDA 11 csrsv2 needs an explicit, user-allocated work buffer */
      stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
                                     loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                                     loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                                     loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
                                     &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
      stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
                               loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
                               loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
                               loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                               ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                               );CHKERRCUSPARSE(stat);

      /* assign the pointer. Is this really necessary? */
      ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;

      A->offloadmask = PETSC_OFFLOAD_BOTH;
      /* release the pinned host staging buffers */
      cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}

/*
   Copies the ICC triangular factors of A to the GPU and caches the
   (symmetric) permutation and its inverse as device arrays when the
   ordering is not the identity. Resets any previously cached factors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  const PetscInt               *rip;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  /* discard any stale device-side factors before rebuilding */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  /* scratch vector used between the two triangular solves */
  cusparseTriFactors->workVector = new THRUSTARRAY(n);
cusparseTriFactors->nnz=(a->nz-n)*2 + n; 807aa372e3fSPaul Mullowney 808087f3262SPaul Mullowney /* lower triangular indices */ 809087f3262SPaul Mullowney ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 810087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 811087f3262SPaul Mullowney if (!perm_identity) { 8124e4bbfaaSStefano Zampini IS iip; 8134e4bbfaaSStefano Zampini const PetscInt *irip; 8144e4bbfaaSStefano Zampini 8154e4bbfaaSStefano Zampini ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr); 8164e4bbfaaSStefano Zampini ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr); 817aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 818aa372e3fSPaul Mullowney cusparseTriFactors->rpermIndices->assign(rip, rip+n); 819aa372e3fSPaul Mullowney cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 8204e4bbfaaSStefano Zampini cusparseTriFactors->cpermIndices->assign(irip, irip+n); 8214e4bbfaaSStefano Zampini ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr); 8224e4bbfaaSStefano Zampini ierr = ISDestroy(&iip);CHKERRQ(ierr); 8234863603aSSatish Balay ierr = PetscLogCpuToGpu(2*n*sizeof(PetscInt));CHKERRQ(ierr); 824087f3262SPaul Mullowney } 825087f3262SPaul Mullowney ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 826087f3262SPaul Mullowney PetscFunctionReturn(0); 827087f3262SPaul Mullowney } 828087f3262SPaul Mullowney 8296fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 8309ae82921SPaul Mullowney { 8319ae82921SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 8329ae82921SPaul Mullowney IS isrow = b->row,iscol = b->col; 8339ae82921SPaul Mullowney PetscBool row_identity,col_identity; 834b175d8bbSPaul Mullowney PetscErrorCode ierr; 8359ae82921SPaul Mullowney 8369ae82921SPaul Mullowney PetscFunctionBegin; 8379ae82921SPaul Mullowney ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 838ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 
839e057df02SPaul Mullowney /* determine which version of MatSolve needs to be used. */ 8409ae82921SPaul Mullowney ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 8419ae82921SPaul Mullowney ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 842bda325fcSPaul Mullowney if (row_identity && col_identity) { 843bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 844bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8454e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8464e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 847bda325fcSPaul Mullowney } else { 848bda325fcSPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 849bda325fcSPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8504e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8514e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 852bda325fcSPaul Mullowney } 8538dc1d2a3SPaul Mullowney 854e057df02SPaul Mullowney /* get the triangular factors */ 855087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 8569ae82921SPaul Mullowney PetscFunctionReturn(0); 8579ae82921SPaul Mullowney } 8589ae82921SPaul Mullowney 859087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info) 860087f3262SPaul Mullowney { 861087f3262SPaul Mullowney Mat_SeqAIJ *b = (Mat_SeqAIJ*)B->data; 862087f3262SPaul Mullowney IS ip = b->row; 863087f3262SPaul Mullowney PetscBool perm_identity; 864b175d8bbSPaul Mullowney PetscErrorCode ierr; 865087f3262SPaul Mullowney 866087f3262SPaul Mullowney PetscFunctionBegin; 867087f3262SPaul Mullowney ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 868ccdfe979SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 869087f3262SPaul Mullowney /* determine which version of MatSolve needs to be used. 
*/ 870087f3262SPaul Mullowney ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr); 871087f3262SPaul Mullowney if (perm_identity) { 872087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 873087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 8744e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8754e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 876087f3262SPaul Mullowney } else { 877087f3262SPaul Mullowney B->ops->solve = MatSolve_SeqAIJCUSPARSE; 878087f3262SPaul Mullowney B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 8794e4bbfaaSStefano Zampini B->ops->matsolve = NULL; 8804e4bbfaaSStefano Zampini B->ops->matsolvetranspose = NULL; 881087f3262SPaul Mullowney } 882087f3262SPaul Mullowney 883087f3262SPaul Mullowney /* get the triangular factors */ 884087f3262SPaul Mullowney ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr); 885087f3262SPaul Mullowney PetscFunctionReturn(0); 886087f3262SPaul Mullowney } 8879ae82921SPaul Mullowney 888b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 889bda325fcSPaul Mullowney { 890bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 891aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr; 892aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr; 893aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 894aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 895bda325fcSPaul Mullowney cusparseStatus_t stat; 896aa372e3fSPaul Mullowney 
cusparseIndexBase_t indexBase; 897aa372e3fSPaul Mullowney cusparseMatrixType_t matrixType; 898aa372e3fSPaul Mullowney cusparseFillMode_t fillMode; 899aa372e3fSPaul Mullowney cusparseDiagType_t diagType; 900b175d8bbSPaul Mullowney 901bda325fcSPaul Mullowney PetscFunctionBegin; 902bda325fcSPaul Mullowney 903aa372e3fSPaul Mullowney /*********************************************/ 904aa372e3fSPaul Mullowney /* Now the Transpose of the Lower Tri Factor */ 905aa372e3fSPaul Mullowney /*********************************************/ 906aa372e3fSPaul Mullowney 907aa372e3fSPaul Mullowney /* allocate space for the transpose of the lower triangular factor */ 908aa372e3fSPaul Mullowney loTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 909aa372e3fSPaul Mullowney 910aa372e3fSPaul Mullowney /* set the matrix descriptors of the lower triangular factor */ 911aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(loTriFactor->descr); 912aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 913aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(loTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
914aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 915aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(loTriFactor->descr); 916aa372e3fSPaul Mullowney 917aa372e3fSPaul Mullowney /* Create the matrix description */ 91857d48284SJunchao Zhang stat = cusparseCreateMatDescr(&loTriFactorT->descr);CHKERRCUSPARSE(stat); 91957d48284SJunchao Zhang stat = cusparseSetMatIndexBase(loTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 92057d48284SJunchao Zhang stat = cusparseSetMatType(loTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 92157d48284SJunchao Zhang stat = cusparseSetMatFillMode(loTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 92257d48284SJunchao Zhang stat = cusparseSetMatDiagType(loTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 923aa372e3fSPaul Mullowney 924aa372e3fSPaul Mullowney /* set the operation */ 925aa372e3fSPaul Mullowney loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 926aa372e3fSPaul Mullowney 927aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the lower triangular factor*/ 928aa372e3fSPaul Mullowney loTriFactorT->csrMat = new CsrMatrix; 929afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 930afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 931aa372e3fSPaul Mullowney loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 932afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows+1); 933afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 934afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 935aa372e3fSPaul Mullowney 936aa372e3fSPaul Mullowney /* compute the transpose of the lower triangular factor, i.e. 
the CSC */ 937afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 938afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 939afb2bd1cSJunchao Zhang loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 940afb2bd1cSJunchao Zhang loTriFactor->csrMat->values->data().get(), 941afb2bd1cSJunchao Zhang loTriFactor->csrMat->row_offsets->data().get(), 942afb2bd1cSJunchao Zhang loTriFactor->csrMat->column_indices->data().get(), 943afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), 944afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 945afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 946afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 947afb2bd1cSJunchao Zhang cudaError_t cerr = cudaMalloc(&loTriFactor->csr2cscBuffer,loTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 948afb2bd1cSJunchao Zhang #endif 949afb2bd1cSJunchao Zhang 950aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, 951aa372e3fSPaul Mullowney loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, 952aa372e3fSPaul Mullowney loTriFactor->csrMat->values->data().get(), 953aa372e3fSPaul Mullowney loTriFactor->csrMat->row_offsets->data().get(), 954aa372e3fSPaul Mullowney loTriFactor->csrMat->column_indices->data().get(), 955aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 956afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 957afb2bd1cSJunchao Zhang loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 958afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 959afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer 960afb2bd1cSJunchao Zhang #else 961afb2bd1cSJunchao Zhang 
loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 962afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 963afb2bd1cSJunchao Zhang #endif 964afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 965aa372e3fSPaul Mullowney 966afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 967afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&loTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 968afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 969afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, 970afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 971afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 972afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, 973afb2bd1cSJunchao Zhang &loTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 974afb2bd1cSJunchao Zhang cerr = cudaMalloc(&loTriFactorT->solveBuffer,loTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 975afb2bd1cSJunchao Zhang #endif 976afb2bd1cSJunchao Zhang 977afb2bd1cSJunchao Zhang /* perform the solve analysis */ 978aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, 979afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, 980afb2bd1cSJunchao Zhang loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 981afb2bd1cSJunchao Zhang loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo 982afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 983afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 984afb2bd1cSJunchao Zhang #endif 985afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 986aa372e3fSPaul Mullowney 987aa372e3fSPaul Mullowney 
/* assign the pointer. Is this really necessary? */ 988aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 989aa372e3fSPaul Mullowney 990aa372e3fSPaul Mullowney /*********************************************/ 991aa372e3fSPaul Mullowney /* Now the Transpose of the Upper Tri Factor */ 992aa372e3fSPaul Mullowney /*********************************************/ 993aa372e3fSPaul Mullowney 994aa372e3fSPaul Mullowney /* allocate space for the transpose of the upper triangular factor */ 995aa372e3fSPaul Mullowney upTriFactorT = new Mat_SeqAIJCUSPARSETriFactorStruct; 996aa372e3fSPaul Mullowney 997aa372e3fSPaul Mullowney /* set the matrix descriptors of the upper triangular factor */ 998aa372e3fSPaul Mullowney matrixType = cusparseGetMatType(upTriFactor->descr); 999aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1000aa372e3fSPaul Mullowney fillMode = cusparseGetMatFillMode(upTriFactor->descr)==CUSPARSE_FILL_MODE_UPPER ? 
1001aa372e3fSPaul Mullowney CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1002aa372e3fSPaul Mullowney diagType = cusparseGetMatDiagType(upTriFactor->descr); 1003aa372e3fSPaul Mullowney 1004aa372e3fSPaul Mullowney /* Create the matrix description */ 100557d48284SJunchao Zhang stat = cusparseCreateMatDescr(&upTriFactorT->descr);CHKERRCUSPARSE(stat); 100657d48284SJunchao Zhang stat = cusparseSetMatIndexBase(upTriFactorT->descr, indexBase);CHKERRCUSPARSE(stat); 100757d48284SJunchao Zhang stat = cusparseSetMatType(upTriFactorT->descr, matrixType);CHKERRCUSPARSE(stat); 100857d48284SJunchao Zhang stat = cusparseSetMatFillMode(upTriFactorT->descr, fillMode);CHKERRCUSPARSE(stat); 100957d48284SJunchao Zhang stat = cusparseSetMatDiagType(upTriFactorT->descr, diagType);CHKERRCUSPARSE(stat); 1010aa372e3fSPaul Mullowney 1011aa372e3fSPaul Mullowney /* set the operation */ 1012aa372e3fSPaul Mullowney upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1013aa372e3fSPaul Mullowney 1014aa372e3fSPaul Mullowney /* allocate GPU space for the CSC of the upper triangular factor*/ 1015aa372e3fSPaul Mullowney upTriFactorT->csrMat = new CsrMatrix; 1016afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1017afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1018aa372e3fSPaul Mullowney upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1019afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows+1); 1020afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1021afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1022aa372e3fSPaul Mullowney 1023aa372e3fSPaul Mullowney /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1024afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1025afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle,upTriFactor->csrMat->num_rows, 1026afb2bd1cSJunchao Zhang upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1027afb2bd1cSJunchao Zhang upTriFactor->csrMat->values->data().get(), 1028afb2bd1cSJunchao Zhang upTriFactor->csrMat->row_offsets->data().get(), 1029afb2bd1cSJunchao Zhang upTriFactor->csrMat->column_indices->data().get(), 1030afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), 1031afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1032afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1033afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1034afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactor->csr2cscBuffer,upTriFactor->csr2cscBufferSize);CHKERRCUDA(cerr); 1035afb2bd1cSJunchao Zhang #endif 1036afb2bd1cSJunchao Zhang 1037aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, 1038aa372e3fSPaul Mullowney upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, 1039aa372e3fSPaul Mullowney upTriFactor->csrMat->values->data().get(), 1040aa372e3fSPaul Mullowney upTriFactor->csrMat->row_offsets->data().get(), 1041aa372e3fSPaul Mullowney upTriFactor->csrMat->column_indices->data().get(), 1042aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1043afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1044afb2bd1cSJunchao Zhang upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, 1045afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase, 1046afb2bd1cSJunchao Zhang CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer 1047afb2bd1cSJunchao Zhang #else 1048afb2bd1cSJunchao Zhang 
upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1049afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1050afb2bd1cSJunchao Zhang #endif 1051afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1052aa372e3fSPaul Mullowney 1053afb2bd1cSJunchao Zhang /* Create the solve analysis information */ 1054afb2bd1cSJunchao Zhang stat = cusparse_create_analysis_info(&upTriFactorT->solveInfo);CHKERRCUSPARSE(stat); 1055afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1056afb2bd1cSJunchao Zhang stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, 1057afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1058afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1059afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, 1060afb2bd1cSJunchao Zhang &upTriFactorT->solveBufferSize);CHKERRCUSPARSE(stat); 1061afb2bd1cSJunchao Zhang cerr = cudaMalloc(&upTriFactorT->solveBuffer,upTriFactorT->solveBufferSize);CHKERRCUDA(cerr); 1062afb2bd1cSJunchao Zhang #endif 1063afb2bd1cSJunchao Zhang 1064afb2bd1cSJunchao Zhang /* perform the solve analysis */ 1065aa372e3fSPaul Mullowney stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, 1066afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, 1067afb2bd1cSJunchao Zhang upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1068afb2bd1cSJunchao Zhang upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo 1069afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1070afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1071afb2bd1cSJunchao Zhang #endif 1072afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1073aa372e3fSPaul Mullowney 
1074aa372e3fSPaul Mullowney /* assign the pointer. Is this really necessary? */ 1075aa372e3fSPaul Mullowney ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT; 1076bda325fcSPaul Mullowney PetscFunctionReturn(0); 1077bda325fcSPaul Mullowney } 1078bda325fcSPaul Mullowney 1079b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A) 1080bda325fcSPaul Mullowney { 1081aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 1082aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 1083aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSEMultStruct *matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 1084bda325fcSPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1085bda325fcSPaul Mullowney cusparseStatus_t stat; 1086aa372e3fSPaul Mullowney cusparseIndexBase_t indexBase; 1087b06137fdSPaul Mullowney cudaError_t err; 108885ba7357SStefano Zampini PetscErrorCode ierr; 1089b175d8bbSPaul Mullowney 1090bda325fcSPaul Mullowney PetscFunctionBegin; 109185ba7357SStefano Zampini if (!cusparsestruct->transgen || cusparsestruct->matTranspose) PetscFunctionReturn(0); 109285ba7357SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 109385ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 109485ba7357SStefano Zampini /* create cusparse matrix */ 1095aa372e3fSPaul Mullowney matstructT = new Mat_SeqAIJCUSPARSEMultStruct; 109657d48284SJunchao Zhang stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat); 1097aa372e3fSPaul Mullowney indexBase = cusparseGetMatIndexBase(matstruct->descr); 109857d48284SJunchao Zhang stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat); 109957d48284SJunchao Zhang stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat); 1100aa372e3fSPaul 
Mullowney 1101b06137fdSPaul Mullowney /* set alpha and beta */ 1102afb2bd1cSJunchao Zhang err = cudaMalloc((void **)&(matstructT->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err); 11037656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err); 11047656d835SStefano Zampini err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err); 1105afb2bd1cSJunchao Zhang err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 11067656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 11077656d835SStefano Zampini err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err); 110857d48284SJunchao Zhang stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat); 1109b06137fdSPaul Mullowney 1110aa372e3fSPaul Mullowney if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 1111aa372e3fSPaul Mullowney CsrMatrix *matrix = (CsrMatrix*)matstruct->mat; 1112aa372e3fSPaul Mullowney CsrMatrix *matrixT= new CsrMatrix; 1113554b8892SKarl Rupp matrixT->num_rows = A->cmap->n; 1114554b8892SKarl Rupp matrixT->num_cols = A->rmap->n; 1115aa372e3fSPaul Mullowney matrixT->num_entries = a->nz; 1116a8bd5306SMark Adams matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1); 1117aa372e3fSPaul Mullowney matrixT->column_indices = new THRUSTINTARRAY32(a->nz); 1118aa372e3fSPaul Mullowney matrixT->values = new THRUSTARRAY(a->nz); 1119a3fdcf43SKarl Rupp 112081902715SJunchao Zhang cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 112181902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1); 1122afb2bd1cSJunchao Zhang 112381902715SJunchao Zhang /* compute the transpose, i.e. 
the CSC */ 1124afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1125afb2bd1cSJunchao Zhang stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, 1126afb2bd1cSJunchao Zhang A->cmap->n, matrix->num_entries, 1127afb2bd1cSJunchao Zhang matrix->values->data().get(), 1128afb2bd1cSJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1129afb2bd1cSJunchao Zhang matrix->column_indices->data().get(), 1130afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1131afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1132afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1133afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, &cusparsestruct->csr2cscBufferSize);CHKERRCUSPARSE(stat); 1134afb2bd1cSJunchao Zhang err = cudaMalloc(&cusparsestruct->csr2cscBuffer,cusparsestruct->csr2cscBufferSize);CHKERRCUDA(err); 1135afb2bd1cSJunchao Zhang #endif 1136afb2bd1cSJunchao Zhang 1137a3fdcf43SKarl Rupp stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, 1138a3fdcf43SKarl Rupp A->cmap->n, matrix->num_entries, 1139aa372e3fSPaul Mullowney matrix->values->data().get(), 114081902715SJunchao Zhang cusparsestruct->rowoffsets_gpu->data().get(), 1141aa372e3fSPaul Mullowney matrix->column_indices->data().get(), 1142aa372e3fSPaul Mullowney matrixT->values->data().get(), 1143afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1144afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, 1145afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC,indexBase, 1146afb2bd1cSJunchao Zhang cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer 1147afb2bd1cSJunchao Zhang #else 1148afb2bd1cSJunchao Zhang matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), 1149afb2bd1cSJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase 1150afb2bd1cSJunchao Zhang #endif 1151afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 
1152aa372e3fSPaul Mullowney matstructT->mat = matrixT; 1153afb2bd1cSJunchao Zhang 1154afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1155afb2bd1cSJunchao Zhang stat = cusparseCreateCsr(&matstructT->matDescr, 1156afb2bd1cSJunchao Zhang matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, 1157afb2bd1cSJunchao Zhang matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), 1158afb2bd1cSJunchao Zhang matrixT->values->data().get(), 1159afb2bd1cSJunchao Zhang CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */ 1160afb2bd1cSJunchao Zhang indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat); 1161afb2bd1cSJunchao Zhang #endif 1162aa372e3fSPaul Mullowney } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) { 1163afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1164afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 1165afb2bd1cSJunchao Zhang #else 1166aa372e3fSPaul Mullowney CsrMatrix *temp = new CsrMatrix; 116751c6d536SStefano Zampini CsrMatrix *tempT = new CsrMatrix; 116851c6d536SStefano Zampini /* First convert HYB to CSR */ 1169aa372e3fSPaul Mullowney temp->num_rows = A->rmap->n; 1170aa372e3fSPaul Mullowney temp->num_cols = A->cmap->n; 1171aa372e3fSPaul Mullowney temp->num_entries = a->nz; 1172aa372e3fSPaul Mullowney temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1173aa372e3fSPaul Mullowney temp->column_indices = new THRUSTINTARRAY32(a->nz); 1174aa372e3fSPaul Mullowney temp->values = new THRUSTARRAY(a->nz); 1175aa372e3fSPaul Mullowney 1176aa372e3fSPaul Mullowney stat = cusparse_hyb2csr(cusparsestruct->handle, 1177aa372e3fSPaul Mullowney matstruct->descr, (cusparseHybMat_t)matstruct->mat, 1178aa372e3fSPaul Mullowney temp->values->data().get(), 1179aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 118057d48284SJunchao Zhang 
temp->column_indices->data().get());CHKERRCUSPARSE(stat); 1181aa372e3fSPaul Mullowney 1182aa372e3fSPaul Mullowney /* Next, convert CSR to CSC (i.e. the matrix transpose) */ 1183aa372e3fSPaul Mullowney tempT->num_rows = A->rmap->n; 1184aa372e3fSPaul Mullowney tempT->num_cols = A->cmap->n; 1185aa372e3fSPaul Mullowney tempT->num_entries = a->nz; 1186aa372e3fSPaul Mullowney tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1); 1187aa372e3fSPaul Mullowney tempT->column_indices = new THRUSTINTARRAY32(a->nz); 1188aa372e3fSPaul Mullowney tempT->values = new THRUSTARRAY(a->nz); 1189aa372e3fSPaul Mullowney 1190aa372e3fSPaul Mullowney stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, 1191aa372e3fSPaul Mullowney temp->num_cols, temp->num_entries, 1192aa372e3fSPaul Mullowney temp->values->data().get(), 1193aa372e3fSPaul Mullowney temp->row_offsets->data().get(), 1194aa372e3fSPaul Mullowney temp->column_indices->data().get(), 1195aa372e3fSPaul Mullowney tempT->values->data().get(), 1196aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 1197aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 119857d48284SJunchao Zhang CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat); 1199aa372e3fSPaul Mullowney 1200aa372e3fSPaul Mullowney /* Last, convert CSC to HYB */ 1201aa372e3fSPaul Mullowney cusparseHybMat_t hybMat; 120257d48284SJunchao Zhang stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat); 1203aa372e3fSPaul Mullowney cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ? 
1204aa372e3fSPaul Mullowney CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 1205aa372e3fSPaul Mullowney stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n, 1206aa372e3fSPaul Mullowney matstructT->descr, tempT->values->data().get(), 1207aa372e3fSPaul Mullowney tempT->row_offsets->data().get(), 1208aa372e3fSPaul Mullowney tempT->column_indices->data().get(), 120957d48284SJunchao Zhang hybMat, 0, partition);CHKERRCUSPARSE(stat); 1210aa372e3fSPaul Mullowney 1211aa372e3fSPaul Mullowney /* assign the pointer */ 1212aa372e3fSPaul Mullowney matstructT->mat = hybMat; 1213aa372e3fSPaul Mullowney /* delete temporaries */ 1214aa372e3fSPaul Mullowney if (tempT) { 1215aa372e3fSPaul Mullowney if (tempT->values) delete (THRUSTARRAY*) tempT->values; 1216aa372e3fSPaul Mullowney if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices; 1217aa372e3fSPaul Mullowney if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets; 1218aa372e3fSPaul Mullowney delete (CsrMatrix*) tempT; 1219087f3262SPaul Mullowney } 1220aa372e3fSPaul Mullowney if (temp) { 1221aa372e3fSPaul Mullowney if (temp->values) delete (THRUSTARRAY*) temp->values; 1222aa372e3fSPaul Mullowney if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices; 1223aa372e3fSPaul Mullowney if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets; 1224aa372e3fSPaul Mullowney delete (CsrMatrix*) temp; 1225aa372e3fSPaul Mullowney } 1226afb2bd1cSJunchao Zhang #endif 1227aa372e3fSPaul Mullowney } 122805035670SJunchao Zhang err = WaitForCUDA();CHKERRCUDA(err); 122985ba7357SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 123085ba7357SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr); 1231213423ffSJunchao Zhang /* the compressed row indices is not used for matTranspose */ 1232213423ffSJunchao Zhang matstructT->cprowIndices = NULL; 1233aa372e3fSPaul Mullowney /* assign the pointer */ 1234aa372e3fSPaul 
Mullowney ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT; 1235bda325fcSPaul Mullowney PetscFunctionReturn(0); 1236bda325fcSPaul Mullowney } 1237bda325fcSPaul Mullowney 12384e4bbfaaSStefano Zampini /* Why do we need to analyze the tranposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */ 12396fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx) 1240bda325fcSPaul Mullowney { 1241c41cb2e2SAlejandro Lamas Daviña PetscInt n = xx->map->n; 1242465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1243465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1244465f34aeSAlejandro Lamas Daviña thrust::device_ptr<const PetscScalar> bGPU; 1245465f34aeSAlejandro Lamas Daviña thrust::device_ptr<PetscScalar> xGPU; 1246bda325fcSPaul Mullowney cusparseStatus_t stat; 1247bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1248aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1249aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1250aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1251b175d8bbSPaul Mullowney PetscErrorCode ierr; 125257d48284SJunchao Zhang cudaError_t cerr; 1253bda325fcSPaul Mullowney 1254bda325fcSPaul Mullowney PetscFunctionBegin; 1255aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1256aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1257bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1258aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1259aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1260bda325fcSPaul Mullowney } 1261bda325fcSPaul Mullowney 1262bda325fcSPaul Mullowney /* Get the GPU pointers */ 1263c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1264c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1265c41cb2e2SAlejandro Lamas Daviña xGPU = thrust::device_pointer_cast(xarray); 1266c41cb2e2SAlejandro Lamas Daviña bGPU = thrust::device_pointer_cast(barray); 1267bda325fcSPaul Mullowney 12687a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1269aa372e3fSPaul Mullowney /* First, reorder with the row permutation */ 1270c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), 1271c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()), 1272c41cb2e2SAlejandro Lamas Daviña xGPU); 1273aa372e3fSPaul Mullowney 1274aa372e3fSPaul Mullowney /* First, solve U */ 1275aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1276afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 1277afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1278afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1279afb2bd1cSJunchao Zhang #endif 1280afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1281aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1282aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1283aa372e3fSPaul Mullowney 
upTriFactorT->csrMat->column_indices->data().get(), 1284aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1285afb2bd1cSJunchao Zhang xarray, tempGPU->data().get() 1286afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1287afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1288afb2bd1cSJunchao Zhang #endif 1289afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1290aa372e3fSPaul Mullowney 1291aa372e3fSPaul Mullowney /* Then, solve L */ 1292aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, 1293afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_rows, 1294afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1295afb2bd1cSJunchao Zhang loTriFactorT->csrMat->num_entries, 1296afb2bd1cSJunchao Zhang #endif 1297afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, loTriFactorT->descr, 1298aa372e3fSPaul Mullowney loTriFactorT->csrMat->values->data().get(), 1299aa372e3fSPaul Mullowney loTriFactorT->csrMat->row_offsets->data().get(), 1300aa372e3fSPaul Mullowney loTriFactorT->csrMat->column_indices->data().get(), 1301aa372e3fSPaul Mullowney loTriFactorT->solveInfo, 1302afb2bd1cSJunchao Zhang tempGPU->data().get(), xarray 1303afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1304afb2bd1cSJunchao Zhang ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer 1305afb2bd1cSJunchao Zhang #endif 1306afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1307aa372e3fSPaul Mullowney 1308aa372e3fSPaul Mullowney /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */ 1309c41cb2e2SAlejandro Lamas Daviña thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), 1310c41cb2e2SAlejandro Lamas Daviña thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()), 1311aa372e3fSPaul Mullowney tempGPU->begin()); 1312aa372e3fSPaul Mullowney 1313aa372e3fSPaul Mullowney /* Copy the temporary to the full solution. 
*/ 1314c41cb2e2SAlejandro Lamas Daviña thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU); 1315bda325fcSPaul Mullowney 1316bda325fcSPaul Mullowney /* restore */ 1317c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr); 1318c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr); 131905035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 1320661c2d29Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1321958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr); 1322bda325fcSPaul Mullowney PetscFunctionReturn(0); 1323bda325fcSPaul Mullowney } 1324bda325fcSPaul Mullowney 13256fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx) 1326bda325fcSPaul Mullowney { 1327465f34aeSAlejandro Lamas Daviña const PetscScalar *barray; 1328465f34aeSAlejandro Lamas Daviña PetscScalar *xarray; 1329bda325fcSPaul Mullowney cusparseStatus_t stat; 1330bda325fcSPaul Mullowney Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr; 1331aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1332aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1333aa372e3fSPaul Mullowney THRUSTARRAY *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector; 1334b175d8bbSPaul Mullowney PetscErrorCode ierr; 133557d48284SJunchao Zhang cudaError_t cerr; 1336bda325fcSPaul Mullowney 1337bda325fcSPaul Mullowney PetscFunctionBegin; 1338aa372e3fSPaul Mullowney /* Analyze the matrix and create the transpose ... 
on the fly */ 1339aa372e3fSPaul Mullowney if (!loTriFactorT && !upTriFactorT) { 1340bda325fcSPaul Mullowney ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr); 1341aa372e3fSPaul Mullowney loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose; 1342aa372e3fSPaul Mullowney upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose; 1343bda325fcSPaul Mullowney } 1344bda325fcSPaul Mullowney 1345bda325fcSPaul Mullowney /* Get the GPU pointers */ 1346c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr); 1347c41cb2e2SAlejandro Lamas Daviña ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr); 1348bda325fcSPaul Mullowney 13497a052e47Shannah_mairs ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1350aa372e3fSPaul Mullowney /* First, solve U */ 1351aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, 1352afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_rows, 1353afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1354afb2bd1cSJunchao Zhang upTriFactorT->csrMat->num_entries, 1355afb2bd1cSJunchao Zhang #endif 1356afb2bd1cSJunchao Zhang &PETSC_CUSPARSE_ONE, upTriFactorT->descr, 1357aa372e3fSPaul Mullowney upTriFactorT->csrMat->values->data().get(), 1358aa372e3fSPaul Mullowney upTriFactorT->csrMat->row_offsets->data().get(), 1359aa372e3fSPaul Mullowney upTriFactorT->csrMat->column_indices->data().get(), 1360aa372e3fSPaul Mullowney upTriFactorT->solveInfo, 1361afb2bd1cSJunchao Zhang barray, tempGPU->data().get() 1362afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 1363afb2bd1cSJunchao Zhang ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer 1364afb2bd1cSJunchao Zhang #endif 1365afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat); 1366aa372e3fSPaul Mullowney 1367aa372e3fSPaul Mullowney /* Then, solve L */ 1368aa372e3fSPaul Mullowney stat = cusparse_solve(cusparseTriFactors->handle, 
                        loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactorT->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolve_SeqAIJCUSPARSE - Solves A x = b on the GPU with the cached cuSPARSE
  lower/upper triangular factors: permute b by the factorization's row
  permutation into the work vector, solve L then U in place of x, then permute
  the result by the column permutation into x.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());

  /* Next, solve L (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Then, solve U (input xarray, output tempGPU) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Same as MatSolve_SeqAIJCUSPARSE,
  but the factorization used the natural ordering, so no row/column
  permutations of b or x are required.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  cusparseStatus_t                  stat;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, solve L */
  stat =
cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        loTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        barray, tempGPU->data().get()
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  /* Next, solve U (input tempGPU, output xarray) */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        upTriFactor->csrMat->num_entries,
#endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
#endif
                        );CHKERRCUSPARSE(stat);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the GPU mirror of a SeqAIJ matrix up to
  date when the CPU copy is newer.  If the nonzero pattern is unchanged and
  the storage format is CSR, only the values (and the cached transpose, if it
  was built) are updated; otherwise the cuSPARSE structures are rebuilt.
  No-op if the matrix is bound to the CPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->was_assembled && A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* Copy values only */
      CsrMatrix *matrix,*matrixT;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr =
PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);

      /* Update matT when it was built before */
      if (cusparsestruct->matTranspose) {
        cusparseIndexBase_t indexBase = cusparseGetMatIndexBase(cusparsestruct->mat->descr);
        matrixT = (CsrMatrix*)cusparsestruct->matTranspose->mat;
        ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                                A->cmap->n, matrix->num_entries,
                                matrix->values->data().get(),
                                cusparsestruct->rowoffsets_gpu->data().get(),
                                matrix->column_indices->data().get(),
                                matrixT->values->data().get(),
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                CUSPARSE_ACTION_NUMERIC,indexBase,
                                cusparsestruct->csr2cscAlg, cusparsestruct->csr2cscBuffer
#else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                                CUSPARSE_ACTION_NUMERIC, indexBase
#endif
                                );CHKERRCUSPARSE(stat);
        err  = WaitForCUDA();CHKERRCUDA(err);
        ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
      }
    } else {
      /* Nonzero pattern changed (or non-CSR format): rebuild everything */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        cusparsestruct->nrows = m;

        /* create cusparse matrix */
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants used with CUSPARSE_POINTER_MODE_DEVICE */
        err = cudaMalloc((void **)&(matstruct->alpha_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          /* assign the pointer */
          matstruct->mat = mat;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                     mat->num_rows, mat->num_cols, mat->num_entries,
                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                     mat->values->data().get(),
                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
#endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
#else
          /* build a temporary CSR copy, convert it to HYB/ELL, then free it */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = a->nz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(a->nz);
          mat->column_indices->assign(a->j, a->j+a->nz);

          mat->values = new THRUSTARRAY(a->nz);
          mat->values->assign(a->a, a->a+a->nz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
                                  matstruct->descr, mat->values->data().get(),
                                  mat->row_offsets->data().get(),
                                  mat->column_indices->data().get(),
                                  hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
#endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      }
catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}

/* Thrust functor: accumulate tuple element 0 into tuple element 1 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};

/* Thrust functor: copy tuple element 1 into tuple element 0 */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};

/* Scratch data attached to a MatProduct for sparse-times-dense products */
struct MatMatCusparse {
  PetscBool   cisdense;  /* was C originally a (CPU) MATSEQDENSE matrix? */
  PetscScalar *Bt;       /* buffer for B^T (pre-CUDA-11 csrmm path only) */
  Mat         X;         /* intermediate result for RARt/PtAP */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscBool            initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t matBDescr;
  cusparseDnMatDescr_t matCDescr;
  size_t               spmmBufferSize;
  void                 *spmmBuffer;
  PetscInt             Blda,Clda; /* Record leading dimensions of B and C here to detect changes*/
#endif
};

/* Destructor for the MatMatCusparse scratch data */
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  PetscErrorCode ierr;
  MatMatCusparse *mmdata = (MatMatCusparse *)data;
  cudaError_t    cerr;

  PetscFunctionBegin;
  cerr = cudaFree(mmdata->Bt);CHKERRCUDA(cerr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
  if (mmdata->matBDescr)  {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->matCDescr)  {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat);}
  if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
#endif
  ierr = MatDestroy(&mmdata->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA - Numeric phase of products of
  a SeqAIJCUSPARSE matrix with a SeqDenseCUDA matrix, using cusparseSpMM
  (CUDA >= 11) or cusparseXcsrmm (older CUDA).
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;
  cudaError_t                  cerr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  /* pick the operand (A or its cached transpose), operation, and result sizes */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!cusp->transgen) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allcoate spmmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmdata->spmmBufferSize);CHKERRCUSPARSE(stat);
    if (mmdata->spmmBuffer) {cerr = cudaFree(mmdata->spmmBuffer);CHKERRCUDA(cerr);}
    cerr = cudaMalloc(&mmdata->spmmBuffer,mmdata->spmmBufferSize);CHKERRCUDA(cerr);
    mmdata->initialized = PETSC_TRUE;
  } else {
    /* to be safe, always update pointers of the mats */
    stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
    stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
  }

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->spmmBuffer);CHKERRCUSPARSE(stat);
#else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ?
mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
#endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase: sets the
  sizes/type of C and allocates the MatMatCusparse scratch data.
  NOTE(review): this function continues beyond this excerpt.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
1988ccdfe979SStefano Zampini } else { 1989ccdfe979SStefano Zampini ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr); 1990ccdfe979SStefano Zampini } 1991ccdfe979SStefano Zampini } 1992ccdfe979SStefano Zampini C->product->data = mmdata; 1993ccdfe979SStefano Zampini C->product->destroy = MatDestroy_MatMatCusparse; 1994ccdfe979SStefano Zampini 1995ccdfe979SStefano Zampini C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 1996ccdfe979SStefano Zampini PetscFunctionReturn(0); 1997ccdfe979SStefano Zampini } 1998ccdfe979SStefano Zampini 1999ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 2000ccdfe979SStefano Zampini 2001ccdfe979SStefano Zampini /* handles dense B */ 2002ccdfe979SStefano Zampini static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat C) 2003ccdfe979SStefano Zampini { 2004ccdfe979SStefano Zampini Mat_Product *product = C->product; 2005ccdfe979SStefano Zampini PetscErrorCode ierr; 2006ccdfe979SStefano Zampini 2007ccdfe979SStefano Zampini PetscFunctionBegin; 2008ccdfe979SStefano Zampini MatCheckProduct(C,1); 2009ccdfe979SStefano Zampini if (!product->A) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing A"); 2010ccdfe979SStefano Zampini if (product->A->boundtocpu) { 2011ccdfe979SStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(C);CHKERRQ(ierr); 2012ccdfe979SStefano Zampini PetscFunctionReturn(0); 2013ccdfe979SStefano Zampini } 2014ccdfe979SStefano Zampini switch (product->type) { 2015ccdfe979SStefano Zampini case MATPRODUCT_AB: 2016ccdfe979SStefano Zampini case MATPRODUCT_AtB: 2017ccdfe979SStefano Zampini case MATPRODUCT_ABt: 2018ccdfe979SStefano Zampini case MATPRODUCT_PtAP: 2019ccdfe979SStefano Zampini case MATPRODUCT_RARt: 2020ccdfe979SStefano Zampini C->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 2021ccdfe979SStefano Zampini default: 2022ccdfe979SStefano Zampini break; 2023ccdfe979SStefano Zampini 
} 2024ccdfe979SStefano Zampini PetscFunctionReturn(0); 2025ccdfe979SStefano Zampini } 2026ccdfe979SStefano Zampini 20276fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 20289ae82921SPaul Mullowney { 2029b175d8bbSPaul Mullowney PetscErrorCode ierr; 20309ae82921SPaul Mullowney 20319ae82921SPaul Mullowney PetscFunctionBegin; 2032e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2033e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2034e6e9a74fSStefano Zampini } 2035e6e9a74fSStefano Zampini 2036e6e9a74fSStefano Zampini static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz) 2037e6e9a74fSStefano Zampini { 2038e6e9a74fSStefano Zampini PetscErrorCode ierr; 2039e6e9a74fSStefano Zampini 2040e6e9a74fSStefano Zampini PetscFunctionBegin; 2041e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr); 2042e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2043e6e9a74fSStefano Zampini } 2044e6e9a74fSStefano Zampini 2045e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2046e6e9a74fSStefano Zampini { 2047e6e9a74fSStefano Zampini PetscErrorCode ierr; 2048e6e9a74fSStefano Zampini 2049e6e9a74fSStefano Zampini PetscFunctionBegin; 2050e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 2051e6e9a74fSStefano Zampini PetscFunctionReturn(0); 2052e6e9a74fSStefano Zampini } 2053e6e9a74fSStefano Zampini 2054e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2055e6e9a74fSStefano Zampini { 2056e6e9a74fSStefano Zampini PetscErrorCode ierr; 2057e6e9a74fSStefano Zampini 2058e6e9a74fSStefano Zampini PetscFunctionBegin; 2059e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_TRUE);CHKERRQ(ierr); 
20609ae82921SPaul Mullowney PetscFunctionReturn(0); 20619ae82921SPaul Mullowney } 20629ae82921SPaul Mullowney 20636fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy) 2064ca45077fSPaul Mullowney { 2065b175d8bbSPaul Mullowney PetscErrorCode ierr; 2066ca45077fSPaul Mullowney 2067ca45077fSPaul Mullowney PetscFunctionBegin; 2068e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2069ca45077fSPaul Mullowney PetscFunctionReturn(0); 2070ca45077fSPaul Mullowney } 2071ca45077fSPaul Mullowney 2072afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 2073e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm) 20749ae82921SPaul Mullowney { 20759ae82921SPaul Mullowney Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2076aa372e3fSPaul Mullowney Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 20779ff858a8SKarl Rupp Mat_SeqAIJCUSPARSEMultStruct *matstruct; 2078e6e9a74fSStefano Zampini PetscScalar *xarray,*zarray,*dptr,*beta,*xptr; 2079b175d8bbSPaul Mullowney PetscErrorCode ierr; 208057d48284SJunchao Zhang cudaError_t cerr; 2081aa372e3fSPaul Mullowney cusparseStatus_t stat; 2082e6e9a74fSStefano Zampini cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2083e6e9a74fSStefano Zampini PetscBool compressed; 2084afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2085afb2bd1cSJunchao Zhang PetscInt nx,ny; 2086afb2bd1cSJunchao Zhang #endif 20876e111a19SKarl Rupp 20889ae82921SPaul Mullowney PetscFunctionBegin; 2089e6e9a74fSStefano Zampini if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported"); 2090e6e9a74fSStefano Zampini if (!a->nonzerorowcnt) { 2091afb2bd1cSJunchao Zhang if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);} 2092e6e9a74fSStefano 
Zampini PetscFunctionReturn(0); 2093e6e9a74fSStefano Zampini } 209434d6c7a5SJose E. Roman /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 209534d6c7a5SJose E. Roman ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2096e6e9a74fSStefano Zampini if (!trans) { 20979ff858a8SKarl Rupp matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2098e6e9a74fSStefano Zampini } else { 2099e6e9a74fSStefano Zampini if (herm || !cusparsestruct->transgen) { 2100e6e9a74fSStefano Zampini opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 2101e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 2102e6e9a74fSStefano Zampini } else { 2103afb2bd1cSJunchao Zhang if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);} 2104e6e9a74fSStefano Zampini matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose; 2105e6e9a74fSStefano Zampini } 2106e6e9a74fSStefano Zampini } 2107e6e9a74fSStefano Zampini /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 2108e6e9a74fSStefano Zampini compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE; 2109213423ffSJunchao Zhang 2110e6e9a74fSStefano Zampini try { 2111e6e9a74fSStefano Zampini ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2112213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */ 2113213423ffSJunchao Zhang else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */ 2114afb2bd1cSJunchao Zhang 211585ba7357SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2116e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2117afb2bd1cSJunchao Zhang /* z = A x + beta y. 
2118afb2bd1cSJunchao Zhang If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 2119afb2bd1cSJunchao Zhang When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 2120afb2bd1cSJunchao Zhang */ 2121e6e9a74fSStefano Zampini xptr = xarray; 2122afb2bd1cSJunchao Zhang dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 2123213423ffSJunchao Zhang beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 2124afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2125afb2bd1cSJunchao Zhang /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 2126afb2bd1cSJunchao Zhang allocated to accommodate different uses. So we get the length info directly from mat. 2127afb2bd1cSJunchao Zhang */ 2128afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2129afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2130afb2bd1cSJunchao Zhang nx = mat->num_cols; 2131afb2bd1cSJunchao Zhang ny = mat->num_rows; 2132afb2bd1cSJunchao Zhang } 2133afb2bd1cSJunchao Zhang #endif 2134e6e9a74fSStefano Zampini } else { 2135afb2bd1cSJunchao Zhang /* z = A^T x + beta y 2136afb2bd1cSJunchao Zhang If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 2137afb2bd1cSJunchao Zhang Note A^Tx is of full length, so we set beta to 1.0 if y exists. 2138afb2bd1cSJunchao Zhang */ 2139afb2bd1cSJunchao Zhang xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 2140e6e9a74fSStefano Zampini dptr = zarray; 2141e6e9a74fSStefano Zampini beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 2142afb2bd1cSJunchao Zhang if (compressed) { /* Scatter x to work vector */ 2143e6e9a74fSStefano Zampini thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 2144e6e9a74fSStefano Zampini thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 2145e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2146e6e9a74fSStefano Zampini VecCUDAEqualsReverse()); 2147e6e9a74fSStefano Zampini } 2148afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2149afb2bd1cSJunchao Zhang if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2150afb2bd1cSJunchao Zhang CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2151afb2bd1cSJunchao Zhang nx = mat->num_rows; 2152afb2bd1cSJunchao Zhang ny = mat->num_cols; 2153afb2bd1cSJunchao Zhang } 2154afb2bd1cSJunchao Zhang #endif 2155e6e9a74fSStefano Zampini } 21569ae82921SPaul Mullowney 2157afb2bd1cSJunchao Zhang /* csr_spmv does y = alpha op(A) x + beta y */ 2158aa372e3fSPaul Mullowney if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2159afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2160afb2bd1cSJunchao Zhang if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 2161afb2bd1cSJunchao Zhang if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 2162afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2163afb2bd1cSJunchao Zhang stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat); 2164afb2bd1cSJunchao Zhang stat = 
cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, 2165afb2bd1cSJunchao Zhang matstruct->matDescr, 2166afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, beta, 2167afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2168afb2bd1cSJunchao Zhang cusparse_scalartype, 2169afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2170afb2bd1cSJunchao Zhang &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat); 2171afb2bd1cSJunchao Zhang cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr); 2172afb2bd1cSJunchao Zhang 2173afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 2174afb2bd1cSJunchao Zhang } else { 2175afb2bd1cSJunchao Zhang /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 2176afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat); 2177afb2bd1cSJunchao Zhang stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat); 2178afb2bd1cSJunchao Zhang } 2179afb2bd1cSJunchao Zhang 2180afb2bd1cSJunchao Zhang stat = cusparseSpMV(cusparsestruct->handle, opA, 2181afb2bd1cSJunchao Zhang matstruct->alpha_one, 2182afb2bd1cSJunchao Zhang matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */ 2183afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecXDescr, 2184afb2bd1cSJunchao Zhang beta, 2185afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].vecYDescr, 2186afb2bd1cSJunchao Zhang cusparse_scalartype, 2187afb2bd1cSJunchao Zhang cusparsestruct->spmvAlg, 2188afb2bd1cSJunchao Zhang matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat); 2189afb2bd1cSJunchao Zhang #else 21907656d835SStefano Zampini CsrMatrix *mat = (CsrMatrix*)matstruct->mat; 2191e6e9a74fSStefano Zampini stat = cusparse_csr_spmv(cusparsestruct->handle, opA, 2192a65300a6SPaul Mullowney mat->num_rows, mat->num_cols, 
2193afb2bd1cSJunchao Zhang mat->num_entries, matstruct->alpha_one, matstruct->descr, 2194aa372e3fSPaul Mullowney mat->values->data().get(), mat->row_offsets->data().get(), 2195e6e9a74fSStefano Zampini mat->column_indices->data().get(), xptr, beta, 219657d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2197afb2bd1cSJunchao Zhang #endif 2198aa372e3fSPaul Mullowney } else { 2199213423ffSJunchao Zhang if (cusparsestruct->nrows) { 2200afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2201afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2202afb2bd1cSJunchao Zhang #else 2203301298b4SMark Adams cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 2204e6e9a74fSStefano Zampini stat = cusparse_hyb_spmv(cusparsestruct->handle, opA, 2205afb2bd1cSJunchao Zhang matstruct->alpha_one, matstruct->descr, hybMat, 2206e6e9a74fSStefano Zampini xptr, beta, 220757d48284SJunchao Zhang dptr);CHKERRCUSPARSE(stat); 2208afb2bd1cSJunchao Zhang #endif 2209a65300a6SPaul Mullowney } 2210aa372e3fSPaul Mullowney } 221105035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2212958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2213aa372e3fSPaul Mullowney 2214e6e9a74fSStefano Zampini if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 2215213423ffSJunchao Zhang if (yy) { /* MatMultAdd: zz = A*xx + yy */ 2216213423ffSJunchao Zhang if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 2217213423ffSJunchao Zhang ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */ 2218e6e9a74fSStefano Zampini } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 2219213423ffSJunchao Zhang ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 22207656d835SStefano Zampini } 2221213423ffSJunchao Zhang } else if (compressed) { /* MatMult: zz = A*xx. 
A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */ 2222c1fb3f03SStefano Zampini ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr); 22237656d835SStefano Zampini } 22247656d835SStefano Zampini 2225213423ffSJunchao Zhang /* ScatterAdd the result from work vector into the full vector when A is compressed */ 2226213423ffSJunchao Zhang if (compressed) { 2227213423ffSJunchao Zhang thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray); 2228e6e9a74fSStefano Zampini 2229e6e9a74fSStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2230c41cb2e2SAlejandro Lamas Daviña thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))), 2231e6e9a74fSStefano Zampini thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), 2232c41cb2e2SAlejandro Lamas Daviña VecCUDAPlusEquals()); 223305035670SJunchao Zhang cerr = WaitForCUDA();CHKERRCUDA(cerr); 2234958c4211Shannah_mairs ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2235e6e9a74fSStefano Zampini } 2236e6e9a74fSStefano Zampini } else { 2237e6e9a74fSStefano Zampini if (yy && yy != zz) { 2238e6e9a74fSStefano Zampini ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */ 2239e6e9a74fSStefano Zampini } 2240e6e9a74fSStefano Zampini } 2241e6e9a74fSStefano Zampini ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr); 2242213423ffSJunchao Zhang if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);} 2243213423ffSJunchao Zhang else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);} 22449ae82921SPaul Mullowney } catch(char *ex) { 22459ae82921SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex); 22469ae82921SPaul Mullowney } 2247e6e9a74fSStefano Zampini if (yy) { 
2248958c4211Shannah_mairs ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr); 2249e6e9a74fSStefano Zampini } else { 2250e6e9a74fSStefano Zampini ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr); 2251e6e9a74fSStefano Zampini } 22529ae82921SPaul Mullowney PetscFunctionReturn(0); 22539ae82921SPaul Mullowney } 22549ae82921SPaul Mullowney 22556fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 2256ca45077fSPaul Mullowney { 2257b175d8bbSPaul Mullowney PetscErrorCode ierr; 22586e111a19SKarl Rupp 2259ca45077fSPaul Mullowney PetscFunctionBegin; 2260e6e9a74fSStefano Zampini ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr); 2261ca45077fSPaul Mullowney PetscFunctionReturn(0); 2262ca45077fSPaul Mullowney } 2263ca45077fSPaul Mullowney 22646fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode) 22659ae82921SPaul Mullowney { 22669ae82921SPaul Mullowney PetscErrorCode ierr; 2267*3fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL, h_mat; 2268*3fa6b06aSMark Adams PetscBool is_seq = PETSC_TRUE; 2269*3fa6b06aSMark Adams PetscInt nnz_state = A->nonzerostate; 22709ae82921SPaul Mullowney PetscFunctionBegin; 2271bc3f50f2SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 2272*3fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2273bc3f50f2SPaul Mullowney } 2274*3fa6b06aSMark Adams if (d_mat) { 2275*3fa6b06aSMark Adams cudaError_t err; 2276*3fa6b06aSMark Adams ierr = PetscInfo(A,"Assemble device matrix\n");CHKERRQ(ierr); 2277*3fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2278*3fa6b06aSMark Adams nnz_state = h_mat.nonzerostate; 2279*3fa6b06aSMark Adams is_seq = h_mat.seq; 2280*3fa6b06aSMark Adams } 2281*3fa6b06aSMark Adams ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it? 
2282*3fa6b06aSMark Adams if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0); 2283*3fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE && A->nonzerostate >= nnz_state && is_seq) { // assembled on CPU eventhough equiped for GPU 2284*3fa6b06aSMark Adams ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 2285*3fa6b06aSMark Adams } else if (nnz_state > A->nonzerostate) { 2286*3fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; 2287*3fa6b06aSMark Adams } 2288*3fa6b06aSMark Adams 22899ae82921SPaul Mullowney PetscFunctionReturn(0); 22909ae82921SPaul Mullowney } 22919ae82921SPaul Mullowney 22929ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/ 2293e057df02SPaul Mullowney /*@ 22949ae82921SPaul Mullowney MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 2295e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 2296e057df02SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 2297e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 2298e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 2299e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 23009ae82921SPaul Mullowney 2301d083f849SBarry Smith Collective 23029ae82921SPaul Mullowney 23039ae82921SPaul Mullowney Input Parameters: 23049ae82921SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 23059ae82921SPaul Mullowney . m - number of rows 23069ae82921SPaul Mullowney . n - number of columns 23079ae82921SPaul Mullowney . 
nz - number of nonzeros per row (same for all rows) 23089ae82921SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 23090298fd71SBarry Smith (possibly different for each row) or NULL 23109ae82921SPaul Mullowney 23119ae82921SPaul Mullowney Output Parameter: 23129ae82921SPaul Mullowney . A - the matrix 23139ae82921SPaul Mullowney 23149ae82921SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 23159ae82921SPaul Mullowney MatXXXXSetPreallocation() paradgm instead of this routine directly. 23169ae82921SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 23179ae82921SPaul Mullowney 23189ae82921SPaul Mullowney Notes: 23199ae82921SPaul Mullowney If nnz is given then nz is ignored 23209ae82921SPaul Mullowney 23219ae82921SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 23229ae82921SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 23239ae82921SPaul Mullowney storage. That is, the stored row and column indices can begin at 23249ae82921SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 23259ae82921SPaul Mullowney 23269ae82921SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 23270298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 23289ae82921SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 23299ae82921SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 23309ae82921SPaul Mullowney 23319ae82921SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 23329ae82921SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. 
We 23339ae82921SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 23349ae82921SPaul Mullowney reusing matrix information to achieve increased efficiency. 23359ae82921SPaul Mullowney 23369ae82921SPaul Mullowney Level: intermediate 23379ae82921SPaul Mullowney 2338e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE 23399ae82921SPaul Mullowney @*/ 23409ae82921SPaul Mullowney PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 23419ae82921SPaul Mullowney { 23429ae82921SPaul Mullowney PetscErrorCode ierr; 23439ae82921SPaul Mullowney 23449ae82921SPaul Mullowney PetscFunctionBegin; 23459ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 23469ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 23479ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 23489ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 23499ae82921SPaul Mullowney PetscFunctionReturn(0); 23509ae82921SPaul Mullowney } 23519ae82921SPaul Mullowney 23526fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A) 23539ae82921SPaul Mullowney { 23549ae82921SPaul Mullowney PetscErrorCode ierr; 2355*3fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = NULL; 2356ab25e6cbSDominic Meiser 23579ae82921SPaul Mullowney PetscFunctionBegin; 23589ae82921SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 2359*3fa6b06aSMark Adams d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat; 2360*3fa6b06aSMark Adams ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL; 2361470880abSPatrick Sanan ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr); 23629ae82921SPaul Mullowney } else { 2363470880abSPatrick Sanan ierr = 
/* NOTE(review): this span begins inside MatDestroy_SeqAIJCUSPARSE(); the function head
   and the declarations of ierr and d_mat (presumably fetched from the Mat_SeqAIJCUSPARSE
   spptr above) are outside this chunk — confirm against the full file. */
MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  }
  if (d_mat) {
    Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
    cudaError_t                err;
    PetscSplitCSRDataStructure h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* Copy the device-resident descriptor back to the host so we can free the buffers it owns */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (h_mat.seq) {
      if (a->compressedrow.use) {
        /* diag.i was allocated separately only in the compressed-row case */
        err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
      }
      err = cudaFree(d_mat);CHKERRCUDA(err);
    }
  }
  /* Remove the methods composed onto the object by MatConvert_SeqAIJ_SeqAIJCUSPARSE() */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
  /* Delegate destruction of the host (CPU) data to the base SeqAIJ implementation */
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);

/* Duplicate a SeqAIJCUSPARSE matrix: duplicate the host SeqAIJ data, then convert the
   copy in place back to the CUSPARSE type so it gets the GPU structures and ops table. */
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Toggle between CPU (plain SeqAIJ) and GPU (CUSPARSE) implementations of the
   matrix operations; flg == PETSC_TRUE binds the matrix to the CPU. No-op for factors. */
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  /* Currently, there is no case in which an AIJCUSPARSE matrix ever has its offloadmask set to PETSC_OFFLOAD_GPU.
     If this changes, we need to implement a routine to update the CPU (host) version of the matrix from the GPU one.
     Right now, for safety we simply check for PETSC_OFFLOAD_GPU and have MatBindToCPU() error in this case.
     TODO: Add MatAIJCUSPARSECopyFromGPU() and make MatBindToCPU() functional for AIJCUSPARSE matrices;
     can follow the example of MatBindToCPU_SeqAIJViennaCL(). */
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"PETSC_OFFLOAD_GPU should not happen. Please report your use case to petsc-dev@mcs.anl.gov");
  /* TODO: add support for this? (hermitian-transpose ops have no CPU fallback below) */
  if (flg) {
    /* Bind to CPU: install the plain SeqAIJ kernels and drop the GPU-only products */
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
  } else {
    /* Bind to GPU: install the CUSPARSE kernels and the dense-product hooks */
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}

static
PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscSplitCSRDataStructure *d_mat = NULL;
  PetscErrorCode ierr;
  PetscFunctionBegin;
  /* Zero all entries: if a device-split-CSR matrix exists, zero its value array on
     the GPU directly, then zero the host copy via the base SeqAIJ implementation. */
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
    d_mat = spptr->deviceMat;
  }
  if (d_mat) {
    Mat_SeqAIJ  *a = (Mat_SeqAIJ*)A->data;
    PetscInt    n = A->rmap->n, nnz = a->i[n];
    cudaError_t err;
    PetscScalar *vals;
    ierr = PetscInfo(A,"Zero device matrix\n");CHKERRQ(ierr);
    /* fetch the device pointer to the value array, then memset it on the device */
    err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    err = cudaMemset( vals, 0, (nnz)*sizeof(PetscScalar));CHKERRCUDA(err);
  }
  ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);

  /* NOTE(review): PETSC_OFFLOAD_BOTH is set even when d_mat is NULL, i.e. when only the
     host values were zeroed above — confirm the cusparse copy is rebuilt before use. */
  A->offloadmask = PETSC_OFFLOAD_BOTH;

  PetscFunctionReturn(0);
}

/* Convert a SeqAIJ matrix to SeqAIJCUSPARSE: allocates the GPU-side bookkeeping
   structure (Mat_SeqAIJCUSPARSE for regular matrices, Mat_SeqAIJCUSPARSETriFactors
   for factors) and installs the CUSPARSE ops table. Supports MAT_INITIAL_MATRIX,
   MAT_REUSE_MATRIX and MAT_INPLACE_MATRIX. */
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  Mat              B;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  B = *newmat;

  /* vectors created from this matrix should live on the GPU by default */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype == MAT_FACTOR_NONE) {
      Mat_SeqAIJCUSPARSE *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      spptr->format = MAT_CUSPARSE_CSR;
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
      spptr->deviceMat = NULL;
    } else {
      Mat_SeqAIJCUSPARSETriFactors *spptr;

      ierr = PetscNew(&spptr);CHKERRQ(ierr);
      stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
      B->spptr = spptr;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->zeroentries    = MatZeroEntries_SeqAIJCUSPARSE;

  /* installs the GPU mult/product kernels (PETSC_FALSE = not bound to CPU) */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor registered for MATSEQAIJCUSPARSE: build a SeqAIJ matrix and
   convert it in place, then process matrix options from the options database. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  /* PetscOptionsObject is declared by the PetscObjectOptionsBegin() macro */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);

/* Register the CUSPARSE solver package as a factorization provider (LU, Cholesky,
   ILU, ICC) for MATSEQAIJCUSPARSE matrices. */
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
25547f756511SDominic Meiser { 2555e6e9a74fSStefano Zampini PetscErrorCode ierr; 25567f756511SDominic Meiser cusparseStatus_t stat; 25577f756511SDominic Meiser cusparseHandle_t handle; 25587f756511SDominic Meiser 25597f756511SDominic Meiser PetscFunctionBegin; 25607f756511SDominic Meiser if (*cusparsestruct) { 2561e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr); 2562e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr); 25637f756511SDominic Meiser delete (*cusparsestruct)->workVector; 256481902715SJunchao Zhang delete (*cusparsestruct)->rowoffsets_gpu; 2565afb2bd1cSJunchao Zhang if (handle = (*cusparsestruct)->handle) {stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);} 2566afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2567afb2bd1cSJunchao Zhang cudaError_t cerr = cudaFree((*cusparsestruct)->csr2cscBuffer);CHKERRCUDA(cerr); 2568afb2bd1cSJunchao Zhang #endif 2569e6e9a74fSStefano Zampini ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr); 25707f756511SDominic Meiser } 25717f756511SDominic Meiser PetscFunctionReturn(0); 25727f756511SDominic Meiser } 25737f756511SDominic Meiser 25747f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat) 25757f756511SDominic Meiser { 25767f756511SDominic Meiser PetscFunctionBegin; 25777f756511SDominic Meiser if (*mat) { 25787f756511SDominic Meiser delete (*mat)->values; 25797f756511SDominic Meiser delete (*mat)->column_indices; 25807f756511SDominic Meiser delete (*mat)->row_offsets; 25817f756511SDominic Meiser delete *mat; 25827f756511SDominic Meiser *mat = 0; 25837f756511SDominic Meiser } 25847f756511SDominic Meiser PetscFunctionReturn(0); 25857f756511SDominic Meiser } 25867f756511SDominic Meiser 2587470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor) 
25887f756511SDominic Meiser { 25897f756511SDominic Meiser cusparseStatus_t stat; 25907f756511SDominic Meiser PetscErrorCode ierr; 25917f756511SDominic Meiser 25927f756511SDominic Meiser PetscFunctionBegin; 25937f756511SDominic Meiser if (*trifactor) { 259457d48284SJunchao Zhang if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); } 2595afb2bd1cSJunchao Zhang if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); } 25967f756511SDominic Meiser ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr); 2597afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2598afb2bd1cSJunchao Zhang cudaError_t cerr; 2599afb2bd1cSJunchao Zhang if ((*trifactor)->solveBuffer) {cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);} 2600afb2bd1cSJunchao Zhang if ((*trifactor)->csr2cscBuffer) {cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);} 2601afb2bd1cSJunchao Zhang #endif 26027f756511SDominic Meiser delete *trifactor; 26037f756511SDominic Meiser *trifactor = 0; 26047f756511SDominic Meiser } 26057f756511SDominic Meiser PetscFunctionReturn(0); 26067f756511SDominic Meiser } 26077f756511SDominic Meiser 2608470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format) 26097f756511SDominic Meiser { 26107f756511SDominic Meiser CsrMatrix *mat; 26117f756511SDominic Meiser cusparseStatus_t stat; 26127f756511SDominic Meiser cudaError_t err; 26137f756511SDominic Meiser 26147f756511SDominic Meiser PetscFunctionBegin; 26157f756511SDominic Meiser if (*matstruct) { 26167f756511SDominic Meiser if ((*matstruct)->mat) { 26177f756511SDominic Meiser if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) { 2618afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2619afb2bd1cSJunchao Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB 
are not supported since CUDA-11.0"); 2620afb2bd1cSJunchao Zhang #else 26217f756511SDominic Meiser cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat; 262257d48284SJunchao Zhang stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat); 2623afb2bd1cSJunchao Zhang #endif 26247f756511SDominic Meiser } else { 26257f756511SDominic Meiser mat = (CsrMatrix*)(*matstruct)->mat; 26267f756511SDominic Meiser CsrMatrix_Destroy(&mat); 26277f756511SDominic Meiser } 26287f756511SDominic Meiser } 262957d48284SJunchao Zhang if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); } 26307f756511SDominic Meiser delete (*matstruct)->cprowIndices; 2631afb2bd1cSJunchao Zhang if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); } 26327656d835SStefano Zampini if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); } 26337656d835SStefano Zampini if ((*matstruct)->beta_one) { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); } 2634afb2bd1cSJunchao Zhang 2635afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 2636afb2bd1cSJunchao Zhang Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 2637afb2bd1cSJunchao Zhang if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);} 2638afb2bd1cSJunchao Zhang for (int i=0; i<3; i++) { 2639afb2bd1cSJunchao Zhang if (mdata->cuSpMV[i].initialized) { 2640afb2bd1cSJunchao Zhang err = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err); 2641afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat); 2642afb2bd1cSJunchao Zhang stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat); 2643afb2bd1cSJunchao Zhang } 2644afb2bd1cSJunchao Zhang } 2645afb2bd1cSJunchao Zhang #endif 26467f756511SDominic Meiser delete *matstruct; 26477f756511SDominic Meiser *matstruct = 0; 26487f756511SDominic Meiser } 26497f756511SDominic Meiser 
PetscFunctionReturn(0); 26507f756511SDominic Meiser } 26517f756511SDominic Meiser 2652ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors) 26537f756511SDominic Meiser { 2654e6e9a74fSStefano Zampini PetscErrorCode ierr; 2655e6e9a74fSStefano Zampini 26567f756511SDominic Meiser PetscFunctionBegin; 26577f756511SDominic Meiser if (*trifactors) { 2658e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr); 2659e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr); 2660e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr); 2661e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr); 26627f756511SDominic Meiser delete (*trifactors)->rpermIndices; 26637f756511SDominic Meiser delete (*trifactors)->cpermIndices; 26647f756511SDominic Meiser delete (*trifactors)->workVector; 2665ccdfe979SStefano Zampini (*trifactors)->rpermIndices = 0; 2666ccdfe979SStefano Zampini (*trifactors)->cpermIndices = 0; 2667ccdfe979SStefano Zampini (*trifactors)->workVector = 0; 2668ccdfe979SStefano Zampini } 2669ccdfe979SStefano Zampini PetscFunctionReturn(0); 2670ccdfe979SStefano Zampini } 2671ccdfe979SStefano Zampini 2672ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors) 2673ccdfe979SStefano Zampini { 2674e6e9a74fSStefano Zampini PetscErrorCode ierr; 2675ccdfe979SStefano Zampini cusparseHandle_t handle; 2676ccdfe979SStefano Zampini cusparseStatus_t stat; 2677ccdfe979SStefano Zampini 2678ccdfe979SStefano Zampini PetscFunctionBegin; 2679ccdfe979SStefano Zampini if (*trifactors) { 2680e6e9a74fSStefano Zampini ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr); 26817f756511SDominic Meiser if 
(handle = (*trifactors)->handle) { 268257d48284SJunchao Zhang stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat); 26837f756511SDominic Meiser } 2684e6e9a74fSStefano Zampini ierr = PetscFree(*trifactors);CHKERRQ(ierr); 26857f756511SDominic Meiser } 26867f756511SDominic Meiser PetscFunctionReturn(0); 26877f756511SDominic Meiser } 2688